Example No. 1
  def __rescheduleFailedJob( self, jobID, message, stop = True ):
    """
    Set Job Status to "Rescheduled" and issue a reschedule command to the Job Manager
    """

    self.log.warn( 'Failure during %s' % ( message ) )

    jobManager = RPCClient( 'WorkloadManagement/JobManager' )
    jobReport = JobReport( int( jobID ), 'JobAgent@%s' % self.siteName )

    #Setting a job parameter does not help since the job will be rescheduled,
    #instead set the status with the cause and then another status showing the
    #reschedule operation.

    jobReport.setJobStatus( status = 'Rescheduled',
                            application = message,
                            sendFlag = True )

    self.log.info( 'Job will be rescheduled' )
    result = jobManager.rescheduleJob( jobID )
    if not result['OK']:
      self.log.error( result['Message'] )
      return self.__finish( 'Problem Rescheduling Job', stop )

    self.log.info( 'Job Rescheduled %s' % ( jobID ) )
    return self.__finish( 'Job Rescheduled', stop )
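This method records the failure cause on the job status before asking the JobManager service to reschedule. Below is a condensed, standalone sketch of the same pattern; `rescheduleWithReport` and the `source` string are illustrative names, and the import paths assume a standard DIRAC client installation.

from DIRAC import gLogger, S_OK, S_ERROR
from DIRAC.Core.DISET.RPCClient import RPCClient
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

def rescheduleWithReport(jobID, cause, source='JobAgent@some.site'):
  """Record the failure cause on the job status, then ask the JobManager to reschedule."""
  jobReport = JobReport(int(jobID), source)
  # The 'Rescheduled' status carries the cause; sendFlag=True sends it immediately
  jobReport.setJobStatus(status='Rescheduled', application=cause, sendFlag=True)
  result = RPCClient('WorkloadManagement/JobManager').rescheduleJob(int(jobID))
  if not result['OK']:
    gLogger.error(result['Message'])
    return S_ERROR('Problem Rescheduling Job')
  return S_OK('Job Rescheduled')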
Example No. 2
def rescheduleFailedJob(jobID,message):
  try:
    import DIRAC
    global jobReport

    gLogger.warn('Failure during %s' %(message))

    #Setting a job parameter does not help since the job will be rescheduled,
    #instead set the status with the cause and then another status showing the
    #reschedule operation.

    if not jobReport:
      gLogger.info('Creating a new JobReport Object')
      jobReport = JobReport(int(jobID),'JobWrapperTemplate')

    jobReport.setApplicationStatus( 'Failed %s ' % message, sendFlag = False )
    jobReport.setJobStatus( 'Rescheduled', message, sendFlag = False )

    # We must send Job States and Parameters before the job gets rescheduled
    jobReport.sendStoredStatusInfo()
    jobReport.sendStoredJobParameters()

    gLogger.info('Job will be rescheduled after exception during execution of the JobWrapper')

    jobManager  = RPCClient('WorkloadManagement/JobManager')
    result = jobManager.rescheduleJob(int(jobID))
    if not result['OK']:
      gLogger.warn(result)

    # Send mail to debug errors
    mailAddress = DIRAC.alarmMail
    site        = DIRAC.siteName()
    subject     = 'Job rescheduled at %s' % site
    ret         = systemCall(0,'hostname')
    wn          = ret['Value'][1]
    msg         = 'Job %s rescheduled at %s, wn=%s\n' % ( jobID, site, wn )
    msg        += message

    NotificationClient().sendMail(mailAddress,subject,msg,fromAddress="*****@*****.**",localAttempt=False)

    return
  except Exception as x:
    gLogger.exception('JobWrapperTemplate failed to reschedule Job')
    return
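Unlike Example No. 1, this wrapper-level version buffers its updates (sendFlag=False) and flushes them explicitly, since the wrapper may be torn down before the reschedule completes. A minimal sketch of just that buffering step (the function name is illustrative):

from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

def flushReportBeforeReschedule(jobID, message):
  # Accumulate the status updates locally instead of sending them one by one
  jobReport = JobReport(int(jobID), 'JobWrapperTemplate')
  jobReport.setApplicationStatus('Failed %s' % message, sendFlag=False)
  jobReport.setJobStatus('Rescheduled', message, sendFlag=False)
  # Nothing has been sent yet; these calls push all stored records at once
  jobReport.sendStoredStatusInfo()
  jobReport.sendStoredJobParameters()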
Example No. 3
  def execute(self):
    """The JobAgent execution method.
    """
    if self.jobCount:
      # Temporary mechanism to pass a shutdown message to the agent
      if os.path.exists('/var/lib/dirac_drain'):
        return self.__finish('Node is being drained by an operator')
      # Only call timeLeft utility after a job has been picked up
      self.log.info('Attempting to check CPU time left for filling mode')
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn(self.timeLeftError)
          return self.__finish(self.timeLeftError)
        self.log.info('%s normalized CPU units remaining in slot' % (self.timeLeft))
        if self.timeLeft <= self.minimumTimeLeft:
          return self.__finish('No more time left')
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
        if not result['OK']:
          return self.__finish(result['Message'])

        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join('.', self.extraOptions)
        else:
          localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
        localCfg.loadFromFile(localConfigFile)
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
        localCfg.writeToFile(localConfigFile)

      else:
        return self.__finish('Filling Mode is Disabled')

    self.log.verbose('Job Agent execution loop')
    result = self.computingElement.available()
    if not result['OK']:
      self.log.info('Resource is not available')
      self.log.info(result['Message'])
      return self.__finish('CE Not Available')

    self.log.info(result['Message'])

    ceInfoDict = result['CEInfoDict']
    runningJobs = ceInfoDict.get("RunningJobs")
    availableSlots = result['Value']

    if not availableSlots:
      if runningJobs:
        self.log.info('No available slots with %d running jobs' % runningJobs)
        return S_OK('Job Agent cycle complete with %d running jobs' % runningJobs)
      else:
        self.log.info('CE is not available')
        return self.__finish('CE Not Available')

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    # Add pilot information
    gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if 'PilotReference' not in ceDict:
      ceDict['PilotReference'] = str(self.pilotReference)
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict('/AgentJobRequirements')
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update(requirementsDict)
      self.log.info('Requirements:', requirementsDict)

    self.log.verbose(ceDict)
    start = time.time()
    jobRequest = MatcherClient().requestJob(ceDict)
    matchTime = time.time() - start
    self.log.info('MatcherTime = %.2f (s)' % (matchTime))

    self.stopAfterFailedMatches = self.am_getOption('StopAfterFailedMatches', self.stopAfterFailedMatches)

    if not jobRequest['OK']:
      if re.search('No match found', jobRequest['Message']):
        self.log.notice('Job request OK: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("seconds timeout") != -1:
        self.log.error('Timeout while requesting job', jobRequest['Message'])
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])
      elif jobRequest['Message'].find("Pilot version does not match") != -1:
        errorMsg = 'Pilot version does not match the production version'
        self.log.error(errorMsg, jobRequest['Message'].replace(errorMsg, ''))
        return S_ERROR(jobRequest['Message'])
      else:
        self.log.notice('Failed to get jobs: %s' % (jobRequest['Message']))
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish('Nothing to do for more than %d cycles' % self.stopAfterFailedMatches)
        return S_OK(jobRequest['Message'])

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag', False)
    jobID = matcherInfo['JobID']
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if param not in matcherInfo:
        self.__report(jobID, 'Failed', 'Matcher did not return %s' % (param))
        return self.__finish('Matcher Failed')
      elif not matcherInfo[param]:
        self.__report(jobID, 'Failed', 'Matcher returned null %s' % (param))
        return self.__finish('Matcher Failed')
      else:
        self.log.verbose('Matcher returned %s = %s ' % (param, matcherInfo[param]))

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters(jobJDL)
    if not parameters['OK']:
      self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
      self.log.warn(parameters['Message'])
      return self.__finish('JDL Problem')

    params = parameters['Value']
    if 'JobID' not in params:
      msg = 'Job has no JobID defined in JDL parameters'
      self.__report(jobID, 'Failed', msg)
      self.log.warn(msg)
      return self.__finish('JDL Problem')
    else:
      jobID = params['JobID']

    if 'JobType' not in params:
      self.log.warn('Job has no JobType defined in JDL parameters')
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if 'CPUTime' not in params:
      self.log.warn('Job has no CPU requirement defined in JDL parameters')

    # Job requirement for a number of processors
    processors = int(params.get('NumberOfProcessors', 1))
    wholeNode = 'WholeNode' in params

    if self.extraOptions:
      params['Arguments'] += ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose('Job request successful: \n', jobRequest['Value'])
    self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
    self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
    self.jobCount += 1
    try:
      jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
      jobReport.setJobParameter('MatcherServiceTime', str(matchTime), sendFlag=False)

      if 'BOINC_JOB_ID' in os.environ:
        # Report BOINC environment
        for thisp in ('BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName'):
          jobReport.setJobParameter(thisp, gConfig.getValue('/LocalSite/%s' % thisp, 'Unknown'), sendFlag=False)

      jobReport.setJobStatus('Matched', 'Job Received by Agent')
      result = self.__setupProxy(ownerDN, jobGroup)
      if not result['OK']:
        return self.__rescheduleFailedJob(jobID, result['Message'], self.stopOnApplicationFailure)
      proxyChain = result.get('Value')

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest(jobID, jobJDL)

      software = self.__checkInstallSoftware(jobID, params, ceDict)
      if not software['OK']:
        self.log.error('Failed to install software for job', '%s' % (jobID))
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

      self.log.debug('Before %sCE submitJob()' % (self.ceName))
      result = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain, processors, wholeNode)
      if not result['OK']:
        self.__report(jobID, 'Failed', result['Message'])
        return self.__finish(result['Message'])
      elif 'PayloadFailed' in result:
        # Do not keep running and do not overwrite the Payload error
        message = 'Payload execution failed with error code %s' % result['PayloadFailed']
        if self.stopOnApplicationFailure:
          return self.__finish(message, self.stopOnApplicationFailure)
        else:
          self.log.info(message)

      self.log.debug('After %sCE submitJob()' % (self.ceName))
    except Exception as subExcept:  # pylint: disable=broad-except
      self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
      return self.__rescheduleFailedJob(jobID, 'Job processing failed with exception', self.stopOnApplicationFailure)

    # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
    cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

    result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        # if the batch system is not defined, use the process time and the CPU normalization defined locally
        self.timeLeft = self.__getCPUTimeLeft()

    return S_OK('Job Agent cycle complete')
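Every call in this agent loop follows DIRAC's return-value convention: functions return a dictionary with an 'OK' key, plus 'Value' on success or 'Message' on failure, usually built with S_OK and S_ERROR. A tiny illustration of the convention (checkTimeLeft is a made-up helper, not part of the agent):

from DIRAC import S_OK, S_ERROR

def checkTimeLeft(timeLeft, minimumTimeLeft):
  # Mirrors the slot check at the top of execute()
  if timeLeft <= minimumTimeLeft:
    return S_ERROR('No more time left')
  return S_OK(timeLeft)

result = checkTimeLeft(1200, 5000)
if not result['OK']:
  print(result['Message'])  # -> 'No more time left'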
Example No. 4
def main():

    from DIRAC.Core.Base import Script
    ### DoCtaIrf options ##########################################################
    Script.registerSwitch("A:", "analysis=", "Analysis Type", setAnalysisType)
    Script.registerSwitch("C:", "cuts=", "Cuts Config", setCutsConfig)
    Script.registerSwitch("R:", "runlist=", "Runlist", setRunlist)
    Script.registerSwitch("Z:", "zenith=", "Zenith", setZenith)
    Script.registerSwitch("O:", "offset=", "Offset", setOffset)
    Script.registerSwitch("M:", "energy=", "Energy Method", setEnergyMethod)
    Script.registerSwitch("T:", "arrayconfig=", "Array Configuration",
                          setArrayConfig)
    Script.registerSwitch("P:", "particle=", "Particle Type", setParticleType)
    ## other options
    Script.registerSwitch("V:", "version=", "HAP version", setVersion)

    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()
    if len(args) < 1:
        Script.showHelp()

    from CTADIRAC.Core.Workflow.Modules.HapApplication import HapApplication
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = os.environ['JOBID']
    jobID = int(jobID)
    jobReport = JobReport(jobID)

    ha = HapApplication()

    HapPack = 'HAP/' + version + '/HAP'

    packs = ['HESS/v0.2/lib', 'HESS/v0.3/root', HapPack]

    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        if sharedArea:
            if checkSoftwarePackage(package, sharedArea())['OK']:
                DIRAC.gLogger.notice('Package found in Shared Area:', package)
                continue
        if localArea:
            if checkSoftwarePackage(package, localArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            if installSoftwarePackage(package, localArea())['OK']:
                continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    ha.setSoftwarePackage(HapPack)

    ha.hapExecutable = 'DoCtaIrf'

    runlistdir = os.environ['PWD']

    build_infile(runlist)

    ha.hapArguments = [
        analysistype, cutsconfig, runlistdir, runlist, zenith, offset,
        arrayconfig, energymethod, particle
    ]

    DIRAC.gLogger.notice('Executing Hap Application')
    res = ha.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failed to execute Hap Application')
        jobReport.setApplicationStatus('Hap Application: Failed')
        DIRAC.exit(-1)

    DIRAC.exit()
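Each switch registered above points at a small setter callback that stores the parsed value in a module-level variable. A minimal sketch of that pattern for a single switch; the callback body mirrors what such setters typically do, and only calls already used in the example appear here:

import DIRAC
from DIRAC import S_OK
from DIRAC.Core.Base import Script

version = None

def setVersion(optionValue):
    # Store the value parsed from -V / --version for later use
    global version
    version = optionValue
    return S_OK()

Script.registerSwitch("V:", "version=", "HAP version", setVersion)
Script.parseCommandLine(ignoreErrors=True)
DIRAC.gLogger.notice('Requested HAP version:', version)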
Example No. 5
def main():

    from DIRAC.Core.Base import Script

    Script.registerSwitch("p:", "run_number=", "Run Number", setRunNumber)
    Script.registerSwitch("T:", "template=", "Template", setCorsikaTemplate)
    Script.registerSwitch("E:", "executable=", "Executable", setExecutable)
    Script.registerSwitch("S:", "simtelConfig=", "SimtelConfig", setConfig)
    Script.registerSwitch("V:", "version=", "Version", setVersion)
    Script.registerSwitch("M:", "mode=", "Mode", setMode)
    Script.registerSwitch("C:", "savecorsika=", "Save Corsika", setSaveCorsika)

    from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
    from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

    Script.parseCommandLine()
    global fcc, fcL, storage_element

    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
    from CTADIRAC.Core.Workflow.Modules.CorsikaApp import CorsikaApp
    from CTADIRAC.Core.Workflow.Modules.Read_CtaApp import Read_CtaApp
    from DIRAC.Core.Utilities.Subprocess import systemCall

    jobID = os.environ['JOBID']
    jobID = int(jobID)
    global jobReport
    jobReport = JobReport(jobID)

    ###########
    ## Checking MD coherence
    fc = FileCatalog('LcgFileCatalog')
    res = fc._getCatalogConfigDetails('DIRACFileCatalog')
    print 'DFC CatalogConfigDetails:', res
    res = fc._getCatalogConfigDetails('LcgFileCatalog')
    print 'LCG CatalogConfigDetails:', res

    fcc = FileCatalogClient()
    fcL = FileCatalog('LcgFileCatalog')

    from DIRAC.Interfaces.API.Dirac import Dirac
    dirac = Dirac()

    #############
    simtelConfigFilesPath = 'sim_telarray/multi'
    simtelConfigFile = simtelConfigFilesPath + '/multi_cta-ultra5.cfg'
    #simtelConfigFile = simtelConfigFilesPath + '/multi_cta-prod1s.cfg'
    createGlobalsFromConfigFiles('prodConfigFile', corsikaTemplate, version)

    ######################Building prod Directory Metadata #######################
    resultCreateProdDirMD = createProdFileSystAndMD()
    if not resultCreateProdDirMD['OK']:
        DIRAC.gLogger.error('Failed to create prod Directory MD')
        jobReport.setApplicationStatus('Failed to create prod Directory MD')
        DIRAC.gLogger.error('Metadata coherence problem, no file produced')
        DIRAC.exit(-1)
    else:
        print 'prod Directory MD successfully created'

    ######################Building corsika Directory Metadata #######################

    resultCreateCorsikaDirMD = createCorsikaFileSystAndMD()
    if not resultCreateCorsikaDirMD['OK']:
        DIRAC.gLogger.error('Failed to create corsika Directory MD')
        jobReport.setApplicationStatus('Failed to create corsika Directory MD')
        DIRAC.gLogger.error(
            'Metadata coherence problem, no corsikaFile produced')
        DIRAC.exit(-1)
    else:
        print 'corsika Directory MD successfully created'

    ############ Producing Corsika File
    global CorsikaSimtelPack
    CorsikaSimtelPack = os.path.join('corsika_simhessarray', version,
                                     'corsika_simhessarray')
    install_CorsikaSimtelPack(version, 'sim')
    cs = CorsikaApp()
    cs.setSoftwarePackage(CorsikaSimtelPack)
    cs.csExe = executable
    cs.csArguments = [
        '--run-number', run_number, '--run', 'corsika', corsikaTemplate
    ]
    corsikaReturnCode = cs.execute()

    if corsikaReturnCode != 0:
        DIRAC.gLogger.error('Corsika Application: Failed')
        jobReport.setApplicationStatus('Corsika Application: Failed')
        DIRAC.exit(-1)
###################### rename of corsika output file #######################
    rundir = 'run' + run_number
    filein = rundir + '/' + corsikaOutputFileName
    corsikaFileName = particle + '_' + thetaP + '_' + phiP + '_alt' + obslev + '_' + 'run' + run_number + '.corsika.gz'
    mv_cmd = 'mv ' + filein + ' ' + corsikaFileName
    if (os.system(mv_cmd)):
        DIRAC.exit(-1)
########################

########################
## files spread in 1000-runs subDirectories
    runNum = int(run_number)
    subRunNumber = '%03d' % runNum
    runNumModMille = runNum % 1000
    runNumTrunc = (runNum - runNumModMille) / 1000
    runNumSeriesDir = '%03dxxx' % runNumTrunc
    print 'runNumSeriesDir=', runNumSeriesDir
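    # Worked example of the series-directory arithmetic above: run_number '12345'
    # gives runNumModMille = 345, runNumTrunc = 12 and runNumSeriesDir = '012xxx',
    # so runs 12000-12999 all land in the same 1000-run series directory.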

    ### create corsika tar luisa ####################
    corsikaTarName = particle + '_' + thetaP + '_' + phiP + '_alt' + obslev + '_' + 'run' + run_number + '.corsika.tar.gz'
    filetar1 = rundir + '/' + 'input'
    filetar2 = rundir + '/' + 'DAT' + run_number + '.dbase'
    filetar3 = rundir + '/run' + str(int(run_number)) + '.log'
    cmdTuple = [
        '/bin/tar', 'zcf', corsikaTarName, filetar1, filetar2, filetar3
    ]
    DIRAC.gLogger.notice('Executing command tuple:', cmdTuple)
    ret = systemCall(0, cmdTuple, sendOutput)
    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute tar')
        DIRAC.exit(-1)

######################################################
    corsikaOutFileDir = os.path.join(corsikaDirPath, particle, 'Data',
                                     runNumSeriesDir)
    corsikaOutFileLFN = os.path.join(corsikaOutFileDir, corsikaFileName)
    corsikaRunNumberSeriesDirExist = fcc.isDirectory(
        corsikaOutFileDir)['Value']['Successful'][corsikaOutFileDir]
    newCorsikaRunNumberSeriesDir = (
        corsikaRunNumberSeriesDirExist != True
    )  # if new runFileSeries, will need to add new MD

    #### create a file to DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK ################
    f = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w')
    f.close()

    if savecorsika == 'True':
        DIRAC.gLogger.notice('Put and register corsika File in LFC and DFC:',
                             corsikaOutFileLFN)
        ret = dirac.addFile(corsikaOutFileLFN, corsikaFileName,
                            storage_element)

        res = CheckCatalogCoherence(corsikaOutFileLFN)

        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Error during addFile call:', ret['Message'])
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

    # put and register corsikaTarFile:
        corsikaTarFileDir = os.path.join(corsikaDirPath, particle, 'Log',
                                         runNumSeriesDir)
        corsikaTarFileLFN = os.path.join(corsikaTarFileDir, corsikaTarName)

        ##### If the storage element is CC-IN2P3-Tape, save the corsika tar file on disk instead ###############
        if storage_element == 'CC-IN2P3-Tape':
            storage_element = 'CC-IN2P3-Disk'

        DIRAC.gLogger.notice(
            'Put and register corsikaTar File in LFC and DFC:',
            corsikaTarFileLFN)
        ret = dirac.addFile(corsikaTarFileLFN, corsikaTarName, storage_element)

        #### Checking and re-establishing catalog coherence #####################
        res = CheckCatalogCoherence(corsikaTarFileLFN)
        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Error during addFile call:', ret['Message'])
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)
######################################################################

        if newCorsikaRunNumberSeriesDir:
            insertRunFileSeriesMD(corsikaOutFileDir, runNumTrunc)
            insertRunFileSeriesMD(corsikaTarFileDir, runNumTrunc)

###### insert corsika File Level metadata ############################################
        corsikaFileMD = {}
        corsikaFileMD['runNumber'] = int(run_number)
        corsikaFileMD['jobID'] = jobID
        corsikaFileMD['corsikaReturnCode'] = corsikaReturnCode
        corsikaFileMD['nbShowers'] = nbShowers

        result = fcc.setMetadata(corsikaOutFileLFN, corsikaFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        result = fcc.setMetadata(corsikaTarFileLFN, corsikaFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

#####  Exit now if only corsika simulation required
    if (mode == 'corsika_standalone'):
        DIRAC.exit()

############ Producing SimTel File
######################Building simtel Directory Metadata #######################

    cfg_dict = {
        "4MSST": 'cta-prod2-4m-dc',
        "SCSST": 'cta-prod2-sc-sst',
        "STD": 'cta-prod2',
        "NSBX3": 'cta-prod2',
        "ASTRI": 'cta-prod2-astri',
        "SCMST": 'cta-prod2-sc3',
        "NORTH": 'cta-prod2n'
    }

    if simtelConfig == "6INROW":
        all_configs = ["4MSST", "SCSST", "ASTRI", "NSBX3", "STD", "SCMST"]
    elif simtelConfig == "5INROW":
        all_configs = ["4MSST", "SCSST", "ASTRI", "NSBX3", "STD"]
    elif simtelConfig == "3INROW":
        all_configs = ["SCSST", "STD", "SCMST"]
    else:
        all_configs = [simtelConfig]

############################################
#for current_conf in all_configs:
#DIRAC.gLogger.notice('current conf is',current_conf)
#if current_conf == "SCMST":
#current_version = version + '_sc3'
#DIRAC.gLogger.notice('current version is', current_version)
#if os.path.isdir('sim_telarray'):
#DIRAC.gLogger.notice('Package found in the local area. Removing package...')
#cmd = 'rm -R sim_telarray corsika-6990 hessioxxx corsika-run'
#if(os.system(cmd)):
#DIRAC.exit( -1 )
#install_CorsikaSimtelPack(current_version)
#else:
#current_version = version
#DIRAC.gLogger.notice('current version is', current_version)
#############################################################

    for current_conf in all_configs:
        DIRAC.gLogger.notice('current conf is', current_conf)
        if current_conf == "SCMST":
            current_version = version + '_sc3'
            DIRAC.gLogger.notice('current version is', current_version)
            installSoftwareEnviron(CorsikaSimtelPack, workingArea(), 'sim-sc3')
        else:
            current_version = version
            DIRAC.gLogger.notice('current version is', current_version)

########################################################

        global simtelDirPath
        global simtelProdVersion

        simtelProdVersion = current_version + '_simtel'
        simtelDirPath = os.path.join(corsikaParticleDirPath, simtelProdVersion)

        resultCreateSimtelDirMD = createSimtelFileSystAndMD(current_conf)
        if not resultCreateSimtelDirMD['OK']:
            DIRAC.gLogger.error('Failed to create simtelArray Directory MD')
            jobReport.setApplicationStatus(
                'Failed to create simtelArray Directory MD')
            DIRAC.gLogger.error(
                'Metadata coherence problem, no simtelArray File produced')
            DIRAC.exit(-1)
        else:
            DIRAC.gLogger.notice('simtel Directory MD successfully created')

############## check simtel data file LFN exists ########################
        simtelFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(
                obslev) + '_' + 'run' + run_number + '.simtel.gz'
        simtelDirPath_conf = simtelDirPath + '_' + current_conf
        simtelOutFileDir = os.path.join(simtelDirPath_conf, 'Data',
                                        runNumSeriesDir)
        simtelOutFileLFN = os.path.join(simtelOutFileDir, simtelFileName)
        res = CheckCatalogCoherence(simtelOutFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('Current conf already done', current_conf)
            continue

#### execute simtelarray ################
        fd = open('run_sim.sh', 'w')
        fd.write("""#! /bin/sh  
source ./Corsika_simhessarrayEnv.sh
export SVNPROD2=$PWD
export SVNTAG=SVN-PROD2_rev10503
export CORSIKA_IO_BUFFER=800MB
cp ../grid_prod2-repro.sh .
ln -s ../%s
ln -s ../$SVNTAG
./grid_prod2-repro.sh %s %s""" %
                 (corsikaFileName, corsikaFileName, current_conf))
        fd.close()
        ####################################

        os.system('chmod u+x run_sim.sh')
        cmdTuple = ['./run_sim.sh']
        ret = systemCall(0, cmdTuple, sendOutputSimTel)
        simtelReturnCode, stdout, stderr = ret['Value']

        if (os.system('grep Broken simtel.log') == 0):
            DIRAC.gLogger.error('Broken string found in simtel.log')
            jobReport.setApplicationStatus('Broken pipe')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Failed to execute run_sim.sh')
            DIRAC.gLogger.error('run_sim.sh status is:', simtelReturnCode)
            DIRAC.exit(-1)

##   check simtel data/log/histo Output File exist
        cfg = cfg_dict[current_conf]
        #cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Data/*.simtel.gz ' + simtelFileName
        if current_conf == "SCMST":
            cmdprefix = 'mv sim-sc3/Data/sim_telarray/' + cfg + '/0.0deg/'
        else:
            cmdprefix = 'mv sim/Data/sim_telarray/' + cfg + '/0.0deg/'

        cmd = cmdprefix + 'Data/*' + cfg + '_*.simtel.gz ' + simtelFileName
        if (os.system(cmd)):
            DIRAC.exit(-1)

############################################
        simtelRunNumberSeriesDirExist = fcc.isDirectory(
            simtelOutFileDir)['Value']['Successful'][simtelOutFileDir]
        newSimtelRunFileSeriesDir = (
            simtelRunNumberSeriesDirExist != True
        )  # if new runFileSeries, will need to add new MD

        simtelLogFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(obslev) + '_' + 'run' + run_number + '.log.gz'
        #cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Log/*.log.gz ' + simtelLogFileName
        cmd = cmdprefix + 'Log/*' + cfg + '_*.log.gz ' + simtelLogFileName
        if (os.system(cmd)):
            DIRAC.exit(-1)
        simtelOutLogFileDir = os.path.join(simtelDirPath_conf, 'Log',
                                           runNumSeriesDir)
        simtelOutLogFileLFN = os.path.join(simtelOutLogFileDir,
                                           simtelLogFileName)

        simtelHistFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(
                obslev) + '_' + 'run' + run_number + '.hdata.gz'
        #cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Histograms/*.hdata.gz ' + simtelHistFileName
        cmd = cmdprefix + 'Histograms/*' + cfg + '_*.hdata.gz ' + simtelHistFileName
        if (os.system(cmd)):
            DIRAC.exit(-1)
        simtelOutHistFileDir = os.path.join(simtelDirPath_conf, 'Histograms',
                                            runNumSeriesDir)
        simtelOutHistFileLFN = os.path.join(simtelOutHistFileDir,
                                            simtelHistFileName)

        ########### quality check on Histo #############################################
        fd = open('check_histo.sh', 'w')
        fd.write("""#! /bin/sh  
nsim=$(list_histograms %s|fgrep 'Histogram 6 '|sed 's/^.*contents: //'| sed 's:/.*$::')
nevents=%d
if [ $nsim -lt $(( $nevents - 20 )) ]; then
echo 'nsim found:' $nsim
echo 'nsim expected:' $nevents
exit 1
else
echo 'nsim found:' $nsim
echo 'nsim expected:' $nevents
fi
""" % (simtelHistFileName, int(nbShowers) * int(cscat)))
        fd.close()

        ret = getSoftwareEnviron(CorsikaSimtelPack)

        if not ret['OK']:
            error = ret['Message']
            DIRAC.gLogger.error(error, CorsikaSimtelPack)
            DIRAC.exit(-1)

        corsikaEnviron = ret['Value']

        os.system('chmod u+x check_histo.sh')
        cmdTuple = ['./check_histo.sh']
        DIRAC.gLogger.notice('Executing command tuple:', cmdTuple)
        ret = systemCall(0, cmdTuple, sendOutput, env=corsikaEnviron)
        checkHistoReturnCode, stdout, stderr = ret['Value']

        if not ret['OK']:
            DIRAC.gLogger.error('Failed to execute check_histo.sh')
            DIRAC.gLogger.error('check_histo.sh status is:',
                                checkHistoReturnCode)
            DIRAC.exit(-1)

        if (checkHistoReturnCode != 0):
            DIRAC.gLogger.error('Failure during check_histo.sh')
            DIRAC.gLogger.error('check_histo.sh status is:',
                                checkHistoReturnCode)
            jobReport.setApplicationStatus('Histo check Failed')
            DIRAC.exit(-1)

########## quality check on Log #############################
        cmd = 'zcat %s | grep Finished.' % simtelLogFileName
        DIRAC.gLogger.notice('Executing system call:', cmd)
        if (os.system(cmd)):
            jobReport.setApplicationStatus('Log check Failed')
            DIRAC.exit(-1)

################################################
        from DIRAC.Core.Utilities import List
        from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
        opsHelper = Operations()

        global seList
        seList = opsHelper.getValue('ProductionOutputs/SimtelProd', [])
        seList = List.randomize(seList)

        DIRAC.gLogger.notice('SeList is:', seList)

        #########  Upload simtel data/log/histo ##############################################

        res = upload_to_seList(simtelOutFileLFN, simtelFileName)

        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('OutputData Upload Error', simtelOutFileLFN)
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        res = CheckCatalogCoherence(simtelOutLogFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('Log file already exists. Removing:',
                                 simtelOutLogFileLFN)
            ret = dirac.removeFile(simtelOutLogFileLFN)

        res = upload_to_seList(simtelOutLogFileLFN, simtelLogFileName)

        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Upload simtel Log Error', simtelOutLogFileLFN)
            DIRAC.gLogger.notice('Removing simtel data file:',
                                 simtelOutFileLFN)
            ret = dirac.removeFile(simtelOutFileLFN)
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        res = CheckCatalogCoherence(simtelOutHistFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('Histo file already exists. Removing:',
                                 simtelOutHistFileLFN)
            ret = dirac.removeFile(simtelOutHistFileLFN)

        res = upload_to_seList(simtelOutHistFileLFN, simtelHistFileName)

        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Upload simtel Histo Error',
                                simtelOutHistFileLFN)
            DIRAC.gLogger.notice('Removing simtel data file:',
                                 simtelOutFileLFN)
            ret = dirac.removeFile(simtelOutFileLFN)
            DIRAC.gLogger.notice('Removing simtel log file:',
                                 simtelOutLogFileLFN)
            ret = dirac.removeFile(simtelOutLogFileLFN)
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

#    simtelRunNumberSeriesDirExist = fcc.isDirectory(simtelOutFileDir)['Value']['Successful'][simtelOutFileDir]
#    newSimtelRunFileSeriesDir = (simtelRunNumberSeriesDirExist != True)  # if new runFileSeries, will need to add new MD

        if newSimtelRunFileSeriesDir:
            print 'insertRunFileSeriesMD'
            insertRunFileSeriesMD(simtelOutFileDir, runNumTrunc)
            insertRunFileSeriesMD(simtelOutLogFileDir, runNumTrunc)
            insertRunFileSeriesMD(simtelOutHistFileDir, runNumTrunc)
        else:
            print 'NotinsertRunFileSeriesMD'

###### simtel File level metadata ############################################
        simtelFileMD = {}
        simtelFileMD['runNumber'] = int(run_number)
        simtelFileMD['jobID'] = jobID
        simtelFileMD['simtelReturnCode'] = simtelReturnCode

        result = fcc.setMetadata(simtelOutFileLFN, simtelFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        result = fcc.setMetadata(simtelOutLogFileLFN, simtelFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        result = fcc.setMetadata(simtelOutHistFileLFN, simtelFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        if savecorsika == 'True':
            result = fcc.addFileAncestors(
                {simtelOutFileLFN: {
                    'Ancestors': [corsikaOutFileLFN]
                }})
            print 'result addFileAncestor:', result

            result = fcc.addFileAncestors(
                {simtelOutLogFileLFN: {
                    'Ancestors': [corsikaOutFileLFN]
                }})
            print 'result addFileAncestor:', result

            result = fcc.addFileAncestors(
                {simtelOutHistFileLFN: {
                    'Ancestors': [corsikaOutFileLFN]
                }})
            print 'result addFileAncestor:', result

##### Skip the read_cta and dst steps if only corsika + simtel production is required
        if (mode == 'corsika_simtel'):
            continue

######### run read_cta #######################################

        rcta = Read_CtaApp()
        rcta.setSoftwarePackage(CorsikaSimtelPack)
        rcta.rctaExe = 'read_cta'

        powerlaw_dict = {
            'gamma': '-2.57',
            'gamma_ptsrc': '-2.57',
            'proton': '-2.70',
            'electron': '-3.21'
        }
        dstFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(
                obslev) + '_' + 'run' + run_number + '.simtel-dst0.gz'
        dstHistoFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(
                obslev) + '_' + 'run' + run_number + '.hdata-dst0.gz'

        ## added some options starting from Armazones_2K prod.
        rcta.rctaArguments = [
            '-r', '4', '-u', '--integration-scheme', '4',
            '--integration-window', '7,3', '--tail-cuts', '6,8', '--min-pix',
            '2', '--min-amp', '20', '--type', '1,0,0,400', '--tail-cuts',
            '9,12', '--min-amp', '20', '--type', '2,0,0,100', '--tail-cuts',
            '8,11', '--min-amp', '19', '--type', '3,0,0,40', '--tail-cuts',
            '6,9', '--min-amp', '15', '--type', '4,0,0,15', '--tail-cuts',
            '3.7,5.5', '--min-amp', '8', '--type', '5,0,0,70,5.6',
            '--tail-cuts', '2.4,3.2', '--min-amp', '5.6', '--dst-level', '0',
            '--dst-file', dstFileName, '--histogram-file', dstHistoFileName,
            '--powerlaw', powerlaw_dict[particle], simtelFileName
        ]

        rctaReturnCode = rcta.execute()

        if rctaReturnCode != 0:
            DIRAC.gLogger.error('read_cta Application: Failed')
            jobReport.setApplicationStatus('read_cta Application: Failed')
            DIRAC.exit(-1)

######## run dst quality checks ######################################

        fd = open('check_dst_histo.sh', 'w')
        fd.write("""#! /bin/sh  
dsthistfilename=%s
dstfile=%s
n6="$(list_histograms -h 6 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')" 
n12001="$(list_histograms -h 12001 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')" 
if [ $n6 -ne $n12001 ]; then
echo 'n6 found:' $n6
echo 'n12001 found:' $n12001
exit 1
else
echo 'n6 found:' $n6
echo 'n12001 found:' $n12001
fi

n12002="$(list_histograms -h 12002 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')" 
nev="$(statio ${dstfile} | egrep '^2010' | cut -f2)"
if [ -z "$nev" ]; then nev="0"; fi

if [ $nev -ne $n12002 ]; then
echo 'nev found:' $nev
echo 'n12002 found:' $n12002
exit 1
else
echo 'nev found:' $nev
echo 'n12002 found:' $n12002
fi
""" % (dstHistoFileName, dstFileName))
        fd.close()

        os.system('chmod u+x check_dst_histo.sh')
        cmdTuple = ['./check_dst_histo.sh']
        DIRAC.gLogger.notice('Executing command tuple:', cmdTuple)
        ret = systemCall(0, cmdTuple, sendOutput, env=corsikaEnviron)
        checkHistoReturnCode, stdout, stderr = ret['Value']

        if not ret['OK']:
            DIRAC.gLogger.error('Failed to execute check_dst_histo.sh')
            DIRAC.gLogger.error('check_dst_histo.sh status is:',
                                checkHistoReturnCode)
            DIRAC.exit(-1)

        if (checkHistoReturnCode != 0):
            DIRAC.gLogger.error('Failure during check_dst_histo.sh')
            DIRAC.gLogger.error('check_dst_histo.sh status is:',
                                checkHistoReturnCode)
            jobReport.setApplicationStatus('Histo check Failed')
            DIRAC.exit(-1)

############create MD and upload dst data/histo ##########################################################

        global dstDirPath
        global dstProdVersion

        dstProdVersion = current_version + '_dst'
        dstDirPath = os.path.join(simtelDirPath_conf, dstProdVersion)

        dstOutFileDir = os.path.join(dstDirPath, 'Data', runNumSeriesDir)
        dstOutFileLFN = os.path.join(dstOutFileDir, dstFileName)

        resultCreateDstDirMD = createDstFileSystAndMD()
        if not resultCreateDstDirMD['OK']:
            DIRAC.gLogger.error('Failed to create Dst Directory MD')
            jobReport.setApplicationStatus('Failed to create Dst Directory MD')
            DIRAC.gLogger.error(
                'Metadata coherence problem, no Dst File produced')
            DIRAC.exit(-1)
        else:
            DIRAC.gLogger.notice('Dst Directory MD successfully created')
############################################################

        res = CheckCatalogCoherence(dstOutFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('dst file already exists. Removing:',
                                 dstOutFileLFN)
            ret = dirac.removeFile(dstOutFileLFN)

        res = upload_to_seList(dstOutFileLFN, dstFileName)

        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Upload dst Error', dstOutFileLFN)
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

##############################################################
        dstHistoFileDir = os.path.join(dstDirPath, 'Histograms',
                                       runNumSeriesDir)
        dstHistoFileLFN = os.path.join(dstHistoFileDir, dstHistoFileName)

        res = CheckCatalogCoherence(dstHistoFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('dst histo file already exists. Removing:',
                                 dstHistoFileLFN)
            ret = dirac.removeFile(dstHistoFileLFN)

        res = upload_to_seList(dstHistoFileLFN, dstHistoFileName)

        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Upload dst Error', dstHistoFileName)
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

########### Insert RunNumSeries MD ##########################

        dstRunNumberSeriesDirExist = fcc.isDirectory(
            dstOutFileDir)['Value']['Successful'][dstOutFileDir]
        newDstRunFileSeriesDir = (
            dstRunNumberSeriesDirExist != True
        )  # if new runFileSeries, will need to add new MD

        if newDstRunFileSeriesDir:
            insertRunFileSeriesMD(dstOutFileDir, runNumTrunc)
            insertRunFileSeriesMD(dstHistoFileDir, runNumTrunc)

####### dst File level metadata ###############################################
        dstFileMD = {}
        dstFileMD['runNumber'] = int(run_number)
        dstFileMD['jobID'] = jobID
        dstFileMD['rctaReturnCode'] = rctaReturnCode

        result = fcc.setMetadata(dstOutFileLFN, dstFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        result = fcc.setMetadata(dstHistoFileLFN, dstFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

########## set the ancestors for dst #####################################

        result = fcc.addFileAncestors(
            {dstOutFileLFN: {
                'Ancestors': [simtelOutFileLFN]
            }})
        print 'result addFileAncestor:', result

        result = fcc.addFileAncestors(
            {dstHistoFileLFN: {
                'Ancestors': [simtelOutFileLFN]
            }})
        print 'result addFileAncestor:', result


######################################################

    DIRAC.exit()
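The same upload / metadata / ancestry pattern is repeated above for the corsika, simtel and dst outputs. Below is a condensed sketch of one iteration of that pattern; the LFN, local file name and storage element are placeholders, and a valid proxy plus catalog configuration are assumed.

import DIRAC
from DIRAC.Interfaces.API.Dirac import Dirac
from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient

dirac = Dirac()
fcc = FileCatalogClient()

lfn = '/vo.example/MC/prod/Data/000xxx/gamma_run000123.simtel.gz'  # placeholder LFN
localFile = 'gamma_run000123.simtel.gz'                            # placeholder local file
storageElement = 'CC-IN2P3-Disk'                                   # placeholder SE name

# Upload and register the file in the catalog
ret = dirac.addFile(lfn, localFile, storageElement)
if not ret['OK']:
    DIRAC.gLogger.error('Error during addFile call:', ret['Message'])
    DIRAC.exit(-1)

# Attach file-level metadata and record provenance, as done for each output above
result = fcc.setMetadata(lfn, {'runNumber': 123, 'jobID': 456})
if not result['OK']:
    DIRAC.gLogger.error('setMetadata failed:', result['Message'])
result = fcc.addFileAncestors({lfn: {'Ancestors': ['/vo.example/MC/prod/parent.corsika.gz']}})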
Example No. 6
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Temporary mechanism to pass a shutdown message to the agent
            if os.path.exists('/var/lib/dirac_drain'):
                return self.__finish('Node is being drained by an operator')
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(
                        "Disabling filling mode as errors calculating time left",
                        self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              '%d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            else:
                self.log.info('CE is not available')
                return self.__finish('CE Not Available')

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
            ceDictList = result['Value']

        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)

            # here finally calling the matcher
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has no JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirements for determining the number of processors
        # the minimum number of processors requested
        processors = int(
            params.get('NumberOfProcessors',
                       int(params.get('MinNumberOfProcessors', 1))))
        # the maximum number of processors allowed to the payload
        maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
        # need or not the whole node for the job
        wholeNode = 'WholeNode' in params
        mpTag = 'MultiProcessor' in params.get('Tags', [])

        if self.extraOptions:
            params['Arguments'] = (params.get('Arguments', '') + ' ' +
                                   self.extraOptions).strip()
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(thisp,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self._setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self._rescheduleFailedJob(jobID, result['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
            result = self._submitJob(jobID, params, ceDict, optimizerParams,
                                     proxyChain, processors, wholeNode,
                                     maxNumberOfProcessors, mpTag)
            if not result['OK']:
                return self.__finish(result['Message'])
            elif 'PayloadFailed' in result:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime, processors)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self._getCPUTimeLeft()

        return S_OK('Job Agent cycle complete')
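
Every call in the agent above follows DIRAC's S_OK/S_ERROR return convention: functions return a dictionary whose 'OK' key decides whether 'Value' or 'Message' is meaningful. A minimal sketch of that convention, assuming only the standard DIRAC helpers (pickJob is a made-up illustration, not part of the agent):

from DIRAC import S_OK, S_ERROR

def pickJob(queue):
    # Return the next job ID, or an error dictionary the caller can inspect.
    if not queue:
        return S_ERROR('No match found')
    return S_OK(queue.pop(0))

result = pickJob([1234])
if result['OK']:
    print('Matched job %s' % result['Value'])
else:
    print('Matcher failed: %s' % result['Message'])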
Ejemplo n.º 7
0
def execute( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception as e:
    gLogger.exception( 'JobWrapper failed the initialization phase', lException = e )
    rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    try:
      job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
    except Exception as e:
      gLogger.exception( 'JobWrapper failed sending job accounting', lException = e )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
      job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception as x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
        job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
  else:
    gLogger.verbose( 'Job has no InputData requirement' )

  gJobReport.commit()

  try:
    result = job.execute( arguments )
    if not result['OK']:
      gLogger.error( 'Failed to execute job', result['Message'] )
      raise JobWrapperError( result['Message'] )
  except Exception as x:
    if str( x ) == '0':
      gLogger.verbose( 'JobWrapper exited with status=0 after execution' )
    else:
      gLogger.exception( 'Job failed in execution phase' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Exception During Execution', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Exception During Execution' )
      return 1

  if arguments['Job'].has_key( 'OutputSandbox' ) or arguments['Job'].has_key( 'OutputData' ):
    try:
      result = job.processJobOutputs( arguments )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception as x:
      gLogger.exception( 'JobWrapper failed to process output files' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Uploading Job Outputs', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Uploading Job Outputs' )
      return 2
  else:
    gLogger.verbose( 'Job has no OutputData or OutputSandbox requirement' )

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the finalization phase' )
    return 2
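
Each phase of the JobWrapperTemplate above (input sandbox, input data, execution, outputs) follows the same pattern: run the phase, commit the accumulated JobReport, and reschedule or fail over when something goes wrong. A minimal sketch of that per-phase control flow, assuming a DIRAC environment; runPhase() is a hypothetical helper used only for illustration:

from DIRAC import gLogger

def runPhase(phaseName, phaseFunc, jobReport, *args):
    # Run one wrapper phase; commit the stored report only if it succeeded.
    try:
        result = phaseFunc(*args)
        if not result['OK']:
            gLogger.warn(result['Message'])
            return False
    except Exception:
        gLogger.exception('JobWrapper failed during %s' % phaseName)
        return False
    jobReport.commit()
    return True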
Ejemplo n.º 8
0
    def execute(self):
        """The JobAgent execution method."""
        self.log.verbose("Job Agent execution loop")

        queueDictItems = list(self.queueDict.items())
        random.shuffle(queueDictItems)

        # Check that there are enough slots locally
        result = self._checkCEAvailability(self.computingElement)
        if not result["OK"] or result["Value"]:
            return result

        for queueName, queueDictionary in queueDictItems:

            # Make sure there is no problem with the queue before trying to submit
            if not self._allowedToSubmit(queueName):
                continue

            # Get a working proxy
            ce = queueDictionary["CE"]
            cpuTime = 86400 * 3
            self.log.verbose("Getting pilot proxy", "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            proxy = result["Value"]
            result = proxy.getRemainingSecs()  # pylint: disable=no-member
            if not result["OK"]:
                return result
            lifetime_secs = result["Value"]
            ce.setProxy(proxy, lifetime_secs)

            # Check that there are enough slots in the remote CE to match a job
            result = self._checkCEAvailability(ce)
            if not result["OK"] or result["Value"]:
                self.failedQueues[queueName] += 1
                continue

            # Get environment details and enhance them
            result = self._getCEDict(ce)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                continue
            ceDictList = result["Value"]

            for ceDict in ceDictList:
                # Information about number of processors might not be returned in CE.getCEStatus()
                ceDict["NumberOfProcessors"] = ce.ceParameters.get("NumberOfProcessors")
                self._setCEDict(ceDict)

            # Update the configuration with the names of the Site, CE and queue to target
            # This is used in the next stages
            self._updateConfiguration("Site", queueDictionary["Site"])
            self._updateConfiguration("GridCE", queueDictionary["CEName"])
            self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
            self._updateConfiguration("RemoteExecution", True)

            # Try to match a job
            jobRequest = self._matchAJob(ceDictList)
            while jobRequest["OK"]:

                # Check matcher information returned
                matcherParams = ["JDL", "DN", "Group"]
                matcherInfo = jobRequest["Value"]
                jobID = matcherInfo["JobID"]
                jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
                result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break

                jobJDL = matcherInfo["JDL"]
                jobGroup = matcherInfo["Group"]
                ownerDN = matcherInfo["DN"]
                ceDict = matcherInfo["CEDict"]
                matchTime = matcherInfo["matchTime"]

                optimizerParams = {}
                for key in matcherInfo:
                    if key not in matcherParams:
                        optimizerParams[key] = matcherInfo[key]

                # Get JDL parameters
                parameters = self._getJDLParameters(jobJDL)
                if not parameters["OK"]:
                    jobReport.setJobStatus(status=JobStatus.FAILED, minorStatus="Could Not Extract JDL Parameters")
                    self.log.warn("Could Not Extract JDL Parameters", parameters["Message"])
                    self.failedQueues[queueName] += 1
                    break

                params = parameters["Value"]
                result = self._extractValuesFromJobParams(params, jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break
                submissionParams = result["Value"]
                jobID = submissionParams["jobID"]
                jobType = submissionParams["jobType"]

                self.log.verbose("Job request successful: \n", jobRequest["Value"])
                self.log.info(
                    "Received", "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" % (jobID, jobType, ownerDN, jobGroup)
                )
                try:
                    jobReport.setJobParameter(par_name="MatcherServiceTime", par_value=str(matchTime), sendFlag=False)
                    jobReport.setJobStatus(
                        status=JobStatus.MATCHED, minorStatus="Job Received by Agent", sendFlag=False
                    )

                    # Setup proxy
                    result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                    if not result_setupProxy["OK"]:
                        result = self._rescheduleFailedJob(jobID, result_setupProxy["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    proxyChain = result_setupProxy.get("Value")

                    # Check software and install them if required
                    software = self._checkInstallSoftware(jobID, params, ceDict, jobReport)
                    if not software["OK"]:
                        self.log.error("Failed to install software for job", "%s" % (jobID))
                        errorMsg = software["Message"]
                        if not errorMsg:
                            errorMsg = "Failed software installation"
                        result = self._rescheduleFailedJob(jobID, errorMsg)
                        self.failedQueues[queueName] += 1
                        break

                    # Submit the job to the CE
                    self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
                    result_submitJob = self._submitJob(
                        jobID=jobID,
                        jobParams=params,
                        resourceParams=ceDict,
                        optimizerParams=optimizerParams,
                        proxyChain=proxyChain,
                        jobReport=jobReport,
                        processors=submissionParams["processors"],
                        wholeNode=submissionParams["wholeNode"],
                        maxNumberOfProcessors=submissionParams["maxNumberOfProcessors"],
                        mpTag=submissionParams["mpTag"],
                    )

                    # Committing the JobReport before evaluating the result of job submission
                    res = jobReport.commit()
                    if not res["OK"]:
                        resFD = jobReport.generateForwardDISET()
                        if not resFD["OK"]:
                            self.log.error("Error generating ForwardDISET operation", resFD["Message"])
                        elif resFD["Value"]:
                            # Here we create the Request.
                            op = resFD["Value"]
                            request = Request()
                            requestName = "jobAgent_%s" % jobID
                            request.RequestName = requestName.replace('"', "")
                            request.JobID = jobID
                            request.SourceComponent = "JobAgent_%s" % jobID
                            request.addOperation(op)
                            # This might fail, but only a message would be printed.
                            self._sendFailoverRequest(request)

                    if not result_submitJob["OK"]:
                        self.log.error("Error during submission", result_submitJob["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    elif "PayloadFailed" in result_submitJob:
                        # Do not keep running and do not overwrite the Payload error
                        message = "Payload execution failed with error code %s" % result_submitJob["PayloadFailed"]
                        self.log.info(message)

                    self.log.debug("After %sCE submitJob()" % (self.ceName))

                    # Check that there are enough slots locally
                    result = self._checkCEAvailability(self.computingElement)
                    if not result["OK"] or result["Value"]:
                        return result

                    # Check that there are enough slots in the remote CE to match a new job
                    result = self._checkCEAvailability(ce)
                    if not result["OK"] or result["Value"]:
                        self.failedQueues[queueName] += 1
                        break

                    # Try to match a new job
                    jobRequest = self._matchAJob(ceDictList)
                except Exception as subExcept:  # pylint: disable=broad-except
                    self.log.exception("Exception in submission", "", lException=subExcept, lExcInfo=True)
                    result = self._rescheduleFailedJob(jobID, "Job processing failed with exception")
                    self.failedQueues[queueName] += 1
                    break

            if not jobRequest["OK"]:
                self._checkMatchingIssues(jobRequest)
                self.failedQueues[queueName] += 1
                continue

        return S_OK("Push Job Agent cycle complete")
Ejemplo n.º 9
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])
            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error(jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                self.log.error(jobRequest["Message"])
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if not params.has_key("JobID"):
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if not params.has_key("JobType"):
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if not params.has_key("SystemConfig"):
            self.log.warn("Job has no system configuration defined in JDL parameters")
            systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
            self.log.info(
                "Setting system config to /LocalSite/Architecture = %s since it was not specified" % systemConfig
            )
            if not systemConfig:
                self.log.warn("/LocalSite/Architecture is not defined")
            params["SystemConfig"] = systemConfig
        else:
            systemConfig = params["SystemConfig"]
            if systemConfig.lower() == "any":
                systemConfig = gConfig.getValue("/LocalSite/Architecture", "")
                self.log.info(
                    "Setting SystemConfig = /LocalSite/Architecture =",
                    '"%s" since it was set to "ANY" in the job description' % systemConfig,
                )
                if not systemConfig:
                    self.log.warn("/LocalSite/Architecture is not defined")
                params["SystemConfig"] = systemConfig

        if not params.has_key("CPUTime"):
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        self.log.verbose("Job request successful: \n %s" % (jobRequest["Value"]))
        self.log.info("Received JobID=%s, JobType=%s, SystemConfig=%s" % (jobID, jobType, systemConfig))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter("GridCEQueue", self.gridCEQueue, sendFlag=False)

            if os.environ.has_key("BOINC_JOB_ID"):
                # Report BOINC environment
                for p in ["BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"]:
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], params, self.stopOnApplicationFailure)
            if "Value" in result and result["Value"]:
                proxyChain = result["Value"]

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job %s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, params, self.stopOnApplicationFailure)

            self.log.verbose("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, jobJDL, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    "Payload execution failed with error code %s" % submission["PayloadFailed"],
                    self.stopOnApplicationFailure,
                )

            self.log.verbose("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", params, self.stopOnApplicationFailure
            )

        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                if self.cpuFactor:
                    # if the batch system is not defined use the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()["Value"]

        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")
Ejemplo n.º 10
0
  def execute(self):
    """ Main execution function.
    """
    self.log.info('Initializing %s' % self.version)
    result = self.resolveInputVariables()
    if not result['OK']:
      self.log.error(result['Message'])
      return result

    if not self.fileReport:
      self.fileReport =  FileReport('Transformation/TransformationManager')

    if self.InputData:
      inputFiles = self.fileReport.getFiles()
      for lfn in self.InputData:
        if not lfn in inputFiles:
          self.log.verbose('No status populated for input data %s, setting to "Unused"' % lfn)
          result = self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')

    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
      self.log.info('Workflow status = %s, step status = %s' %(self.workflowStatus['OK'], self.stepStatus['OK']))
      inputFiles = self.fileReport.getFiles()
      for lfn in inputFiles:
        if inputFiles[lfn] != 'ApplicationCrash':
          self.log.info('Forcing status to "Unused" due to workflow failure for: %s' % (lfn))
          self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')
    else:
      inputFiles = self.fileReport.getFiles()
      if inputFiles:
        self.log.info('Workflow status OK, setting input file status to Processed')                
      for lfn in inputFiles:
        self.log.info('Setting status to "Processed" for: %s' % (lfn))
        self.fileReport.setFileStatus(int(self.productionID), lfn, 'Processed')  

    result = self.fileReport.commit()
    if not result['OK']:
      self.log.error('Failed to report file status to ProductionDB, request will be generated', result['Message'])
    else:
      self.log.info('Status of files has been properly updated in the ProcessingDB')

    # Must ensure that the local job report instance is used to report the final status
    # in case of failure and a subsequent failover operation
    if self.workflowStatus['OK'] and self.stepStatus['OK']: 
      if not self.jobReport:
        self.jobReport = JobReport(int(self.jobID))
      jobStatus = self.jobReport.setApplicationStatus('Job Finished Successfully')
      if not jobStatus['OK']:
        self.log.warn(jobStatus['Message'])

    # Retrieve the accumulated reporting request
    reportRequest = None
    if self.jobReport:
      result = self.jobReport.generateRequest()
      if not result['OK']:
        self.log.warn('Could not generate request for job report with result:\n%s' % (result))
      else:
        reportRequest = result['Value']
    if reportRequest:
      self.log.info('Populating request with job report information')
      self.request.update(reportRequest)

    fileReportRequest = None
    if self.fileReport:
      result = self.fileReport.generateRequest()
      if not result['OK']:
        self.log.warn('Could not generate request for file report with result:\n%s' % (result))
      else:
        fileReportRequest = result['Value']
    if fileReportRequest:
      self.log.info('Populating request with file report information')
      result = self.request.update(fileReportRequest)

    accountingReport = None
    if self.workflow_commons.has_key('AccountingReport'):
      accountingReport = self.workflow_commons['AccountingReport']
    if accountingReport:
      result = accountingReport.commit()
      if not result['OK']:
        self.log.info('Populating request with accounting report information')
        self.request.setDISETRequest(result['rpcStub'])

    if self.request.isEmpty()['Value']:
      self.log.info('Request is empty, nothing to do.')
      return self.finalize()

    request_string = self.request.toXML()['Value']
    self.log.debug(request_string)
    # Write out the request string
    fname = '%s_%s_request.xml' % (self.productionID, self.prodJobID)
    xmlfile = open(fname, 'w')
    xmlfile.write(request_string)
    xmlfile.close()
    self.log.info('Creating failover request for deferred operations for job %s:' % self.jobID)
    result = self.request.getDigest()
    if result['OK']:
      digest = result['Value']
      self.log.info(digest)

    if not self.enable:
      self.log.info('Module is disabled by control flag')
      return S_OK('Module is disabled by control flag')

    return self.finalize()
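
The FailoverRequest module above persists any non-empty request as an XML file named after the production and job IDs, so a later failover agent can replay the deferred operations. A minimal sketch of that persistence step under the same naming convention (saveRequest is an illustrative helper, not part of the module):

def saveRequest(request, productionID, prodJobID):
    # Serialise the accumulated request and write it next to the job outputs.
    request_string = request.toXML()['Value']
    fname = '%s_%s_request.xml' % (productionID, prodJobID)
    with open(fname, 'w') as xmlfile:
        xmlfile.write(request_string)
    return fname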
Ejemplo n.º 11
0
class FailoverRequest(ModuleBase):
  """ Handle the failover requests issued by previous steps. Used in production. 
  """
  #############################################################################
  def __init__(self):
    """Module initialization.
    """
    super(FailoverRequest, self).__init__()
    self.version = __RCSID__
    self.log = gLogger.getSubLogger( "FailoverRequest" )
    #Internal parameters
    self.enable = True
    self.jobID = ''
    self.productionID = None
    self.prodJobID = None
    #Workflow parameters
    self.jobReport  = None
    self.fileReport = None
    self.request = None

  #############################################################################
  def applicationSpecificInputs(self):
    """ By convention the module input parameters are resolved here.
    """
    self.log.debug(self.workflow_commons)
    self.log.debug(self.step_commons)

    if os.environ.has_key('JOBID'):
      self.jobID = os.environ['JOBID']
      self.log.verbose('Found WMS JobID = %s' %self.jobID)
    else:
      self.log.info('No WMS JobID found, disabling module via control flag')
      self.enable = False

    if self.step_commons.has_key('Enable'):
      self.enable = self.step_commons['Enable']
      if not type(self.enable) == type(True):
        self.log.warn('Enable flag set to non-boolean value %s, setting to False' % self.enable)
        self.enable = False

    #Earlier modules will have populated the report objects
    if self.workflow_commons.has_key('JobReport'):
      self.jobReport = self.workflow_commons['JobReport']

    if self.workflow_commons.has_key('FileReport'):
      self.fileReport = self.workflow_commons['FileReport']

    if self.InputData:
      if type(self.InputData) != type([]):
        self.InputData = self.InputData.split(';')

      self.InputData = [x.replace('LFN:','') for x in self.InputData]

    if self.workflow_commons.has_key('Request'):
      self.request = self.workflow_commons['Request']
    if not self.request:
      self.request = RequestContainer()
      self.request.setRequestName('job_%s_request.xml' % self.jobID)
      self.request.setJobID(self.jobID)
      self.request.setSourceComponent("Job_%s" % self.jobID)

    if self.workflow_commons.has_key('PRODUCTION_ID'):
      self.productionID = self.workflow_commons['PRODUCTION_ID']

    if self.workflow_commons.has_key('JOB_ID'):
      self.prodJobID = self.workflow_commons['JOB_ID']

    return S_OK('Parameters resolved')

  #############################################################################
  def execute(self):
    """ Main execution function.
    """
    self.log.info('Initializing %s' % self.version)
    result = self.resolveInputVariables()
    if not result['OK']:
      self.log.error(result['Message'])
      return result

    if not self.fileReport:
      self.fileReport =  FileReport('Transformation/TransformationManager')

    if self.InputData:
      inputFiles = self.fileReport.getFiles()
      for lfn in self.InputData:
        if not lfn in inputFiles:
          self.log.verbose('No status populated for input data %s, setting to "Unused"' % lfn)
          result = self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')

    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
      self.log.info('Workflow status = %s, step status = %s' %(self.workflowStatus['OK'], self.stepStatus['OK']))
      inputFiles = self.fileReport.getFiles()
      for lfn in inputFiles:
        if inputFiles[lfn] != 'ApplicationCrash':
          self.log.info('Forcing status to "Unused" due to workflow failure for: %s' % (lfn))
          self.fileReport.setFileStatus(int(self.productionID), lfn, 'Unused')
    else:
      inputFiles = self.fileReport.getFiles()
      if inputFiles:
        self.log.info('Workflow status OK, setting input file status to Processed')                
      for lfn in inputFiles:
        self.log.info('Setting status to "Processed" for: %s' % (lfn))
        self.fileReport.setFileStatus(int(self.productionID), lfn, 'Processed')  

    result = self.fileReport.commit()
    if not result['OK']:
      self.log.error('Failed to report file status to ProductionDB, request will be generated', result['Message'])
    else:
      self.log.info('Status of files has been properly updated in the ProcessingDB')

    # Must ensure that the local job report instance is used to report the final status
    # in case of failure and a subsequent failover operation
    if self.workflowStatus['OK'] and self.stepStatus['OK']: 
      if not self.jobReport:
        self.jobReport = JobReport(int(self.jobID))
      jobStatus = self.jobReport.setApplicationStatus('Job Finished Successfully')
      if not jobStatus['OK']:
        self.log.warn(jobStatus['Message'])

    # Retrieve the accumulated reporting request
    reportRequest = None
    if self.jobReport:
      result = self.jobReport.generateRequest()
      if not result['OK']:
        self.log.warn('Could not generate request for job report with result:\n%s' % (result))
      else:
        reportRequest = result['Value']
    if reportRequest:
      self.log.info('Populating request with job report information')
      self.request.update(reportRequest)

    fileReportRequest = None
    if self.fileReport:
      result = self.fileReport.generateRequest()
      if not result['OK']:
        self.log.warn('Could not generate request for file report with result:\n%s' % (result))
      else:
        fileReportRequest = result['Value']
    if fileReportRequest:
      self.log.info('Populating request with file report information')
      result = self.request.update(fileReportRequest)

    accountingReport = None
    if self.workflow_commons.has_key('AccountingReport'):
      accountingReport = self.workflow_commons['AccountingReport']
    if accountingReport:
      result = accountingReport.commit()
      if not result['OK']:
        self.log.info('Populating request with accounting report information')
        self.request.setDISETRequest(result['rpcStub'])

    if self.request.isEmpty()['Value']:
      self.log.info('Request is empty, nothing to do.')
      return self.finalize()

    request_string = self.request.toXML()['Value']
    self.log.debug(request_string)
    # Write out the request string
    fname = '%s_%s_request.xml' % (self.productionID, self.prodJobID)
    xmlfile = open(fname, 'w')
    xmlfile.write(request_string)
    xmlfile.close()
    self.log.info('Creating failover request for deferred operations for job %s:' % self.jobID)
    result = self.request.getDigest()
    if result['OK']:
      digest = result['Value']
      self.log.info(digest)

    if not self.enable:
      self.log.info('Module is disabled by control flag')
      return S_OK('Module is disabled by control flag')

    return self.finalize()

  #############################################################################
  def finalize(self):
    """ Finalize and report correct status for the workflow based on the workflow
        or step status.
    """
    self.log.verbose('Workflow status = %s, step status = %s' % (self.workflowStatus['OK'], self.stepStatus['OK']))
    if not self.workflowStatus['OK'] or not self.stepStatus['OK']:
      self.log.warn('Workflow status is not ok, will not overwrite status')
      self.log.info('Workflow failed, end of FailoverRequest module execution.')
      return S_ERROR('Workflow failed, FailoverRequest module completed')

    self.log.info('Workflow successful, end of FailoverRequest module execution.')
    return S_OK('FailoverRequest module completed')

#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#EOF#
Ejemplo n.º 12
0
def main():

    from DIRAC.Core.Base import Script
    Script.initialize()

    DIRAC.gLogger.notice('Platform is:')
    os.system('dirac-platform')
    from DIRAC.DataManagementSystem.Client.DataManager import DataManager
    from CTADIRAC.Core.Workflow.Modules.EvnDispApp import EvnDispApp
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = os.environ['JOBID']
    jobID = int(jobID)
    jobReport = JobReport(jobID)

    version = sys.argv[3]
    DIRAC.gLogger.notice('Version:', version)

    EvnDispPack = os.path.join('evndisplay', version, 'evndisplay')

    packs = [EvnDispPack]

    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        if checkSoftwarePackage(package, sharedArea())['OK']:
            DIRAC.gLogger.notice('Package found in Shared Area:', package)
            installSoftwareEnviron(package, sharedArea())
            continue
        else:
            # Assuming installSoftwarePackage() follows the usual S_OK/S_ERROR
            # convention; fall through to the error below only if it fails.
            result = installSoftwarePackage(package, workingArea())
            if result['OK']:
                DIRAC.gLogger.notice('Package installed in workingArea:', package)
                continue

        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    ed = EvnDispApp()
    ed.setSoftwarePackage(EvnDispPack)

    dstFileLFNList = sys.argv[-1].split('ParametricParameters={')[1].split(
        '}')[0].replace(',', ' ')

    args = []
    i = 0
    for word in dstFileLFNList.split():
        i = i + 1
        dstfile = os.path.basename(word)
        ###### execute evndisplay stage1 ###############
        executable = sys.argv[5]
        logfileName = executable + '_' + str(i) + '.log'
        args = ['-sourcefile', dstfile, '-outputdirectory', 'outdir']
        # add other arguments for evndisp specified by user ######
        evndispparfile = open('evndisp.par', 'r').readlines()
        for line in evndispparfile:
            for word in line.split():
                args.append(word)

        execute_module(ed, executable, args)

        for name in glob.glob('outdir/*.root'):
            evndispOutFile = name.split('.root')[0] + '_' + str(
                jobID) + '_evndisp.root'
            cmd = 'mv ' + name + ' ' + os.path.basename(evndispOutFile)
            if (os.system(cmd)):
                DIRAC.exit(-1)

########### quality check on Log #############################################
        cmd = 'mv ' + executable + '.log' + ' ' + logfileName
        if (os.system(cmd)):
            DIRAC.exit(-1)
        fd = open('check_log.sh', 'w')
        fd.write("""#! /bin/sh
if grep -i "error" %s; then
exit 1
fi
if grep "Final checks on result file (seems to be OK):" %s; then
exit 0
else
exit 1
fi
""" % (logfileName, logfileName))
        fd.close()

        os.system('chmod u+x check_log.sh')
        cmd = './check_log.sh'
        DIRAC.gLogger.notice('Executing system call:', cmd)
        if (os.system(cmd)):
            jobReport.setApplicationStatus('EvnDisp Log Check Failed')
            DIRAC.exit(-1)


##################################################################
########### remove the dst file #############################################
        cmd = 'rm ' + dstfile
        if (os.system(cmd)):
            DIRAC.exit(-1)

    DIRAC.exit()
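
The quality check above writes a small shell script that fails if the EvnDisp log contains 'error' (case-insensitive) or is missing the final-check line. The same test can be done directly in Python; a hedged sketch, with checkLogFile() as a hypothetical helper rather than part of the CTADIRAC workflow:

def checkLogFile(logfileName):
    # Mirror check_log.sh: fail on any 'error', require the final-check marker.
    with open(logfileName) as logfile:
        content = logfile.read()
    if 'error' in content.lower():
        return False
    return 'Final checks on result file (seems to be OK):' in content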
Ejemplo n.º 13
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info("Attempting to check CPU time left for filling mode")
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info("%s normalized CPU units remaining in slot" % (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish("No more time left")
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(cpuTimeLeft=self.timeLeft)
                if not result["OK"]:
                    return self.__finish(result["Message"])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join(".", self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc", "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection("/LocalSite"):
                    localCfg.createNewSection("/LocalSite")
                localCfg.setOption("/LocalSite/CPUTimeLeft", self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish("Filling Mode is Disabled")

        self.log.verbose("Job Agent execution loop")
        available = self.computingElement.available()
        if not available["OK"] or not available["Value"]:
            self.log.info("Resource is not available")
            self.log.info(available["Message"])
            return self.__finish("CE Not Available")

        self.log.info(available["Message"])

        result = self.computingElement.getDescription()
        if not result["OK"]:
            return result
        ceDict = result["Value"]

        # Add pilot information
        gridCE = gConfig.getValue("LocalSite/GridCE", "Unknown")
        if gridCE != "Unknown":
            ceDict["GridCE"] = gridCE
        if not "PilotReference" in ceDict:
            ceDict["PilotReference"] = str(self.pilotReference)
        ceDict["PilotBenchmark"] = self.cpuFactor
        ceDict["PilotInfoReportedFlag"] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict("/AgentJobRequirements")
        if result["OK"]:
            requirementsDict = result["Value"]
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info("MatcherTime = %.2f (s)" % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption("StopAfterFailedMatches", self.stopAfterFailedMatches)

        if not jobRequest["OK"]:
            if re.search("No match found", jobRequest["Message"]):
                self.log.notice("Job request OK: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("seconds timeout") != -1:
                self.log.error("Timeout while requesting job", jobRequest["Message"])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])
            elif jobRequest["Message"].find("Pilot version does not match") != -1:
                errorMsg = "Pilot version does not match the production version"
                self.log.error(errorMsg, jobRequest["Message"].replace(errorMsg, ""))
                return S_ERROR(jobRequest["Message"])
            else:
                self.log.notice("Failed to get jobs: %s" % (jobRequest["Message"]))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish("Nothing to do for more than %d cycles" % self.stopAfterFailedMatches)
                return S_OK(jobRequest["Message"])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest["Value"]
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get("PilotInfoReportedFlag", False)
        jobID = matcherInfo["JobID"]
        matcherParams = ["JDL", "DN", "Group"]
        for param in matcherParams:
            if param not in matcherInfo:
                self.__report(jobID, "Failed", "Matcher did not return %s" % (param))
                return self.__finish("Matcher Failed")
            elif not matcherInfo[param]:
                self.__report(jobID, "Failed", "Matcher returned null %s" % (param))
                return self.__finish("Matcher Failed")
            else:
                self.log.verbose("Matcher returned %s = %s " % (param, matcherInfo[param]))

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters["OK"]:
            self.__report(jobID, "Failed", "Could Not Extract JDL Parameters")
            self.log.warn(parameters["Message"])
            return self.__finish("JDL Problem")

        params = parameters["Value"]
        if "JobID" not in params:
            msg = "Job has not JobID defined in JDL parameters"
            self.__report(jobID, "Failed", msg)
            self.log.warn(msg)
            return self.__finish("JDL Problem")
        else:
            jobID = params["JobID"]

        if "JobType" not in params:
            self.log.warn("Job has no JobType defined in JDL parameters")
            jobType = "Unknown"
        else:
            jobType = params["JobType"]

        if "CPUTime" not in params:
            self.log.warn("Job has no CPU requirement defined in JDL parameters")

        if self.extraOptions:
            params["Arguments"] += " " + self.extraOptions
            params["ExtraOptions"] = self.extraOptions

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info("Received JobID=%s, JobType=%s" % (jobID, jobType))
        self.log.info("OwnerDN: %s JobGroup: %s" % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
            jobReport.setJobParameter("MatcherServiceTime", str(matchTime), sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for p in ("BoincUserID", "BoincHostID", "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(p, gConfig.getValue("/LocalSite/%s" % p, "Unknown"), sendFlag=False)

            jobReport.setJobStatus("Matched", "Job Received by Agent")
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result["OK"]:
                return self.__rescheduleFailedJob(jobID, result["Message"], self.stopOnApplicationFailure)
            proxyChain = result.get("Value")

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software["OK"]:
                self.log.error("Failed to install software for job", "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                return self.__rescheduleFailedJob(jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug("Before %sCE submitJob()" % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict, optimizerParams, proxyChain)
            if not submission["OK"]:
                self.__report(jobID, "Failed", submission["Message"])
                return self.__finish(submission["Message"])
            elif "PayloadFailed" in submission:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % submission["PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self.__finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, "Job processing failed with exception", self.stopOnApplicationFailure
            )

        # Sum all times but the last one (elapsed_time) and remove times at init (is this correct?)
        cpuTime = sum(os.times()[:-1]) - sum(self.initTimes[:-1])

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result["OK"]:
            self.timeLeft = result["Value"]
        else:
            if result["Message"] != "Current batch system is not supported":
                self.timeLeftError = result["Message"]
            else:
                # if the batch system is not defined, use the process time and the CPU normalization defined locally
                self.timeLeft = self.__getCPUTimeLeft()

        scaledCPUTime = self.timeLeftUtil.getScaledCPU()
        self.__setJobParam(jobID, "ScaledCPUTime", str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK("Job Agent cycle complete")
Ejemplo n.º 14
0
#!/usr/bin/env python

import os
import sys

from DIRAC.Core.Base import Script

Script.initialize(ignoreErrors=True)

from DIRAC.Interfaces.API.Dirac import Dirac
from DIRAC.Interfaces.API.Job import Job

from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

jobID = os.environ.get('DIRACJOBID', '')

if not jobID:
    print 'DIRAC job ID not found'
    sys.exit(1)

jobReport = JobReport(jobID, 'JUNO_JobScript')
result = jobReport.setApplicationStatus(', '.join(sys.argv[1:]))
if not result['OK']:
    print 'Set application status error: %s' % result
Ejemplo n.º 15
0
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']
    
    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:  
      ceDict['PilotReference'] = str( self.pilotReference ) 
    ceDict['PilotBenchmark'] = self.cpuFactor 
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag
    
    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No work available', jobRequest['Message'] ):
        self.log.info( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return S_ERROR( 'Nothing to do' )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return S_ERROR( 'Nothing to do' )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.info( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return S_ERROR( 'Nothing to do' )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo:
      if key not in matcherParams:
        optimizerParams[key] = matcherInfo[key]

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has no JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'SystemConfig' ):
      self.log.warn( 'Job has no system configuration defined in JDL parameters' )
      systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
      self.log.info( 'Setting system config to /LocalSite/Architecture = %s since it was not specified' % systemConfig )
      if not systemConfig:
        self.log.warn( '/LocalSite/Architecture is not defined' )
      params['SystemConfig'] = systemConfig
    else:
      systemConfig = params['SystemConfig']
      if systemConfig.lower() == 'any':
        systemConfig = gConfig.getValue( '/LocalSite/Architecture', '' )
        self.log.info( 'Setting SystemConfig = /LocalSite/Architecture =',
                       '"%s" since it was set to "ANY" in the job description' % systemConfig )
        if not systemConfig:
          self.log.warn( '/LocalSite/Architecture is not defined' )
        params['SystemConfig'] = systemConfig

    if not params.has_key( 'MaxCPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s, SystemConfig=%s' % ( jobID, jobType, systemConfig ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
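    # From here on the job is counted against this pilot; any unexpected
    # exception in the submission sequence below reschedules the job.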
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )
      if self.gridCEQueue:
        jobReport.setJobParameter( 'GridCEQueue', self.gridCEQueue, sendFlag = False )
      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      # self.__setJobSite( jobID, self.siteName )
      if not self.pilotInfoReportedFlag:
        self.__reportPilotInfo( jobID )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      # Guard against an empty proxy result so that proxyChain is always defined
      proxyChain = result.get( 'Value' )
      if not proxyChain:
        return self.__rescheduleFailedJob( jobID, 'No proxy chain returned', self.stopOnApplicationFailure )

      # Is this necessary at all?
      saveJDL = self.__saveJobJDLRequest( jobID, jobJDL )
      #self.__report(jobID,'Matched','Job Prepared to Submit')

      #resourceParameters = self.__getJDLParameters( resourceJDL )
      #if not resourceParameters['OK']:
      #  return resourceParameters
      #resourceParams = resourceParameters['Value']

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.verbose( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, jobJDL, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.verbose( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

    result = self.timeLeftUtil.getTimeLeft( 0.0 )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # If the batch system is not supported, use the locally defined
          # CPUNormalizationFactor instead
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )
Ejemplo n.º 16
0
def execute(arguments):

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)
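    # Move into the requested working directory, creating it if necessary;
    # failure to create it reschedules the job.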

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
Ejemplo n.º 17
0
def main():

  from DIRAC.Core.Base import Script

### make_CTA_DST options ###############################################
  Script.registerSwitch( "R:", "run_number=", "Run Number", setRunNumber )
  Script.registerSwitch( "I:", "infile=", "Input file", setInfile )
  Script.registerSwitch( "T:", "tellist=", "Tellist", setTellist )
  Script.registerSwitch( "N:", "nevent=", "Nevent", setNevent )
### other options ###############################################
  Script.registerSwitch( "V:", "version=", "HAP version", setVersion )
   
  Script.parseCommandLine( ignoreErrors = True ) 
  
  args = Script.getPositionalArgs()

  if len( args ) < 1:
    Script.showHelp()
  
  if infile is None or tellist is None or version is None:
    Script.showHelp()
    jobReport.setApplicationStatus('Options badly specified')
    DIRAC.exit( -1 ) 
   
  from CTADIRAC.Core.Workflow.Modules.HapRootMacro import HapRootMacro
  from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
  from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
  from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
  from DIRAC.Core.Utilities.Subprocess import systemCall  
  from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
  
  jobID = os.environ['JOBID']
  jobID = int( jobID )
  jobReport = JobReport( jobID )
  
  HapPack = 'HAP/' + version + '/HAP'

  packs = ['HESS/v0.2/lib','HESS/v0.3/root',HapPack] 
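  # Each package is looked up first in the shared area, then in the local
  # area, and finally installed locally before giving up.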

  for package in packs:
    DIRAC.gLogger.notice( 'Checking:', package )
    if sharedArea:
      if checkSoftwarePackage( package, sharedArea() )['OK']:
        DIRAC.gLogger.notice( 'Package found in Shared Area:', package )
        continue
    if localArea:
      if checkSoftwarePackage( package, localArea() )['OK']:
        DIRAC.gLogger.notice( 'Package found in Local Area:', package )
        continue
      if installSoftwarePackage( package, localArea() )['OK']:
        continue
    DIRAC.gLogger.error( 'Check Failed for software package:', package )
    DIRAC.gLogger.error( 'Software package not available')
    DIRAC.exit( -1 ) 

  hr = HapRootMacro()
  hr.setSoftwarePackage(HapPack)

  telconf = os.path.join( localArea(),'HAP/%s/config/%s' % (version,tellist)) 
  infilestr = '"' + infile + '"'
  telconfstr = '"' + telconf + '"'
  args = [str(int(RunNum)), infilestr, telconfstr]
  
  try:
    args.extend([nevent])
  except NameError:
    DIRAC.gLogger.info( 'nevent arg not used' )
 
  DIRAC.gLogger.notice( 'make_CTA_DST macro Arguments:', args )
  hr.rootMacro = '/hapscripts/dst/make_CTA_DST.C+'
  hr.rootArguments = args
  DIRAC.gLogger.notice( 'Executing Hap make_CTA_DST macro' )
  res = hr.execute()

  if not res['OK']:
    DIRAC.gLogger.error( 'Failed to execute make_CTA_DST macro')
    jobReport.setApplicationStatus('Failure during make_CTA_DST')
    DIRAC.exit( -1 )

############ check existence of output file ####
  filedst = 'dst_CTA_%08d' % int(RunNum) + '.root'

  if not os.path.isfile(filedst):
    DIRAC.gLogger.error('dst file not found:', filedst ) 
    jobReport.setApplicationStatus('make_CTA_DST.C: DST file not created')
    DIRAC.exit( -1 )

###################Check std out #############################
  DIRAC.gLogger.notice('Executing DST Check step0')
  
  ret = getSoftwareEnviron(HapPack)
  if not ret['OK']:
    error = ret['Message']
    DIRAC.gLogger.error( error, HapPack)
    DIRAC.exit( -1 )

  hapEnviron = ret['Value']
  hessroot =  hapEnviron['HESSROOT']
  check_script = hessroot + '/hapscripts/dst/check_dst0.csh'
  cmdTuple = [check_script]

  ret = systemCall( 0, cmdTuple, sendOutput)
       
  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute DST Check step0')
    jobReport.setApplicationStatus('Check_dst0: Failed')
    DIRAC.exit( -1 )

  status, stdout, stderr = ret['Value']
  if status==1:
    jobReport.setApplicationStatus('Check_dst0: Big problem during the DST production')
    DIRAC.gLogger.error( 'DST Check step0 reports: Big problem during the DST production' )
    DIRAC.exit( -1 )
  if status==2:
    jobReport.setApplicationStatus('Check_dst0: No triggered events')
    DIRAC.gLogger.notice( 'DST Check step0 reports: No triggered events' )
    DIRAC.exit( )

############# run the CheckDST macro #################
  DIRAC.gLogger.notice('Executing DST check step1')
  hr.rootMacro = '/hapscripts/dst/CheckDST.C+'
  fileoutstr = '"' + filedst + '"'
  args = [fileoutstr] 
  DIRAC.gLogger.notice( 'CheckDST macro Arguments:', args )
  hr.rootArguments = args
  DIRAC.gLogger.notice( 'Executing Hap CheckDST macro')
  res = hr.execute()

  if not res['OK']:
    DIRAC.gLogger.error( 'Failure during DST Check step1' )
    jobReport.setApplicationStatus('Check_dst1: Failed')
    DIRAC.exit( -1 )

######################check stdout of CheckDST macro ###########################
  DIRAC.gLogger.notice('Executing DST Check step2')
  check_script = hessroot + '/hapscripts/dst/check_dst2.csh'
  cmdTuple = [check_script]
  ret = systemCall( 0, cmdTuple, sendOutput )
       
  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute DST Check step2')
    jobReport.setApplicationStatus('Check_dst2: Failed')
    DIRAC.exit( -1 )

  status, stdout, stderr = ret['Value']
  if status==1:
    jobReport.setApplicationStatus('DST Check step2: Big problem during the DST production')
    DIRAC.gLogger.error( 'DST Check step2 reports: Big problem during the DST production' )
    DIRAC.exit( -1 )
  if status==2:
    jobReport.setApplicationStatus('DST Check step2: No triggered events')
    DIRAC.gLogger.notice( 'DST Check step2 reports: No triggered events' )
    DIRAC.exit( )

  DIRAC.exit()
Ejemplo n.º 18
0
def main():

  from DIRAC.Core.Base import Script
  Script.initialize() 

  DIRAC.gLogger.notice('Platform is:')
  os.system('dirac-platform')
  from CTADIRAC.Core.Workflow.Modules.Read_CtaApp import Read_CtaApp
  from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
  from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
  from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
  from DIRAC.Core.Utilities.Subprocess import systemCall
  from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

  jobID = os.environ['JOBID']
  jobID = int( jobID )
  jobReport = JobReport( jobID )

  version = sys.argv[3]
  DIRAC.gLogger.notice( 'Version:', version )
  install_CorsikaSimtelPack(version)

  ######### run read_cta #######################################

  rcta = Read_CtaApp()
  CorsikaSimtelPack = os.path.join('corsika_simhessarray',version,'corsika_simhessarray')
  rcta.setSoftwarePackage(CorsikaSimtelPack)
  rcta.rctaExe = 'read_cta'

  # add arguments for read_cta specified by user ######
  args = []
  rctaparfile = open('read_cta.par', 'r').readlines()  
  for line in rctaparfile:
    for word in line.split():
      args.append(word) 

  simtelFileLFN = sys.argv[-1].split('ParametricInputData=LFN:')[1]
  simtelFileName = os.path.basename(simtelFileLFN)
  dstFileName = simtelFileName.replace('simtel.gz','simtel-dst.gz')
  dstHistoFileName = simtelFileName.replace('simtel.gz','hdata-dst.gz')

  args.extend(['--dst-file', dstFileName, '--histogram-file', dstHistoFileName, simtelFileName])
  rcta.rctaArguments = args

  rctaReturnCode = rcta.execute()
  
  if rctaReturnCode != 0:
    DIRAC.gLogger.error( 'read_cta Application: Failed')
    jobReport.setApplicationStatus('read_cta Application: Failed')
    DIRAC.exit( -1 )
#################################################################
  from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
  ret = getSoftwareEnviron( CorsikaSimtelPack )

  if not ret['OK']:
    error = ret['Message']
    DIRAC.gLogger.error( error, CorsikaSimtelPack )
    DIRAC.exit( -1 )

  read_ctaEnviron = ret['Value']

######## run dst quality checks ######################################
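  # The script below requires the entry counts of histograms 6 and 12001 to
  # match, and the entry count of histogram 12002 to match the event count in
  # the DST file; any mismatch makes it exit with a non-zero status.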

  fd = open('check_dst_histo.sh', 'w' )
  fd.write( """#! /bin/sh  
dsthistfilename=%s
dstfile=%s
n6="$(list_histograms -h 6 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')"
n12001="$(list_histograms -h 12001 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')"
if [ $n6 -ne $n12001 ]; then
echo 'n6 found:' $n6
echo 'n12001 found:' $n12001
exit 1
else
echo 'n6 found:' $n6
echo 'n12001 found:' $n12001
fi

n12002="$(list_histograms -h 12002 ${dsthistfilename} | grep 'Histogram of type' | sed 's/.*bins, //' | sed 's/ entries.//')"
nev="$(statio ${dstfile} | egrep '^2010' | cut -f2)"
if [ -z "$nev" ]; then nev="0"; fi

if [ $nev -ne $n12002 ]; then
echo 'nev found:' $nev
echo 'n12002 found:' $n12002
exit 1
else
echo 'nev found:' $nev
echo 'n12002 found:' $n12002
fi
""" % (dstHistoFileName,dstFileName))
  fd.close()

  os.system('chmod u+x check_dst_histo.sh')
  cmdTuple = ['./check_dst_histo.sh']
  DIRAC.gLogger.notice( 'Executing command tuple:', cmdTuple )
  ret = systemCall( 0, cmdTuple, sendOutput, env = read_ctaEnviron )

  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute check_dst_histo.sh')
    DIRAC.exit( -1 )

  checkHistoReturnCode, stdout, stderr = ret['Value']

  if (checkHistoReturnCode!=0):
    DIRAC.gLogger.error( 'Failure during check_dst_histo.sh')
    DIRAC.gLogger.error( 'check_dst_histo.sh status is:', checkHistoReturnCode)
    jobReport.setApplicationStatus('Histo check Failed')
    DIRAC.exit( -1 )

  DIRAC.exit()
Ejemplo n.º 19
0
def main():

  from DIRAC.Core.Base import Script

#### eventio_cta options ##########################################
  Script.registerSwitch( "T:", "tellist=", "Tellist", setTellist )
  Script.registerSwitch( "F:", "Nfirst_mcevt=", "Nfirst_mcevt", setNfirst_mcevt)
  Script.registerSwitch( "L:", "Nlast_mcevt=", "Nlast_mcevt", setNlast_mcevt)
## add other eventio_cta options ################################
#  Script.registerSwitch( "N:", "num=", "Num", setNum)
##  Script.registerSwitch( "L:", "limitmc=", "Limitmc", setLimitmc)
#  Script.registerSwitch( "S:", "telidoffset=", "Telidoffset", setTelidoffset)
  Script.registerSwitch( "P:", "pixelslices=", "setPixelslices (true/false)",setPixelslices)
  Script.registerSwitch( "p:", "run_number=", "Run Number (set automatically)", setRunNumber ) 
### other options ###############################################
  Script.registerSwitch( "V:", "version=", "HAP version", setVersion )

  Script.parseCommandLine( ignoreErrors = True ) 
  
  args = Script.getPositionalArgs()

  if len( args ) < 1:
    Script.showHelp()
  
  if tellist is None or version is None:
    Script.showHelp()
    jobReport.setApplicationStatus('Options badly specified')
    DIRAC.exit( -1 ) 
   
  from CTADIRAC.Core.Workflow.Modules.HapApplication import HapApplication
  from CTADIRAC.Core.Workflow.Modules.HapRootMacro import HapRootMacro
  from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
  from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
  from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
  from DIRAC.Core.Utilities.Subprocess import systemCall
  from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

  jobID = os.environ['JOBID']
  jobID = int( jobID )
  jobReport = JobReport( jobID )

  HapPack = 'HAP/' + version + '/HAP'

  packs = ['HESS/v0.2/lib','HESS/v0.3/root',HapPack] 

  for package in packs:
    DIRAC.gLogger.notice( 'Checking:', package )
    if sharedArea:
      if checkSoftwarePackage( package, sharedArea() )['OK']:
        DIRAC.gLogger.notice( 'Package found in Shared Area:', package )
        continue
    if localArea:
      if checkSoftwarePackage( package, localArea() )['OK']:
        DIRAC.gLogger.notice( 'Package found in Local Area:', package )
        continue
      if installSoftwarePackage( package, localArea() )['OK']:
        continue
    DIRAC.gLogger.error( 'Check Failed for software package:', package )
    DIRAC.gLogger.error( 'Software package not available')
    DIRAC.exit( -1 ) 

  telconf = os.path.join( localArea(),'HAP/%s/config/%s' % (version,tellist)) 

  ha = HapApplication()
  ha.setSoftwarePackage(HapPack)
  ha.hapExecutable = 'eventio_cta'

  fileout = 'raw_' + part_type + '_run' + run_number + '.root'
  infile = build_infile()
  ha.hapArguments = ['-file', infile, '-o', fileout, '-tellist', telconf]

  try:
    ha.hapArguments.extend(['-Nfirst_mcevt', Nfirst_mcevt, '-Nlast_mcevt', Nlast_mcevt])
  except NameError:
    DIRAC.gLogger.info( 'Nfirst_mcevt/Nlast_mcevt options are not used' )

  try:
    if(pixelslices == 'true'):
      ha.hapArguments.extend(['-pixelslices'])
  except NameError:
      DIRAC.gLogger.info( 'pixelslices option is not used' )

  DIRAC.gLogger.notice( 'Executing Hap Converter Application' )
  res = ha.execute()

  if not res['OK']:
    DIRAC.gLogger.error( 'Failed to execute eventio_cta Application')
    jobReport.setApplicationStatus('eventio_cta: Failed')
    DIRAC.exit( -1 )
  
  if not os.path.isfile(fileout):
    error = 'raw file was not created:'
    DIRAC.gLogger.error( error, fileout )
    jobReport.setApplicationStatus('eventio_cta: RawData not created')
    DIRAC.exit( -1 )

###################### Check RAW DATA #######################
  hr = HapRootMacro()
  hr.setSoftwarePackage(HapPack)
 
  DIRAC.gLogger.notice('Executing RAW check step0')
  hr.rootMacro = '/hapscripts/dst/Open_Raw.C+'
  outfilestr = '"' + fileout + '"'
  args = [outfilestr]
  DIRAC.gLogger.notice( 'Open_Raw macro Arguments:', args )
  hr.rootArguments = args
  DIRAC.gLogger.notice( 'Executing Hap Open_Raw macro')
  res = hr.execute()

  if not res['OK']:
    DIRAC.gLogger.error( 'Open_Raw: Failed' )
    DIRAC.exit( -1 )

################# Check stdout of Open_Raw.C macro ###############################
  DIRAC.gLogger.notice('Executing Raw Check step1')
    
  ret = getSoftwareEnviron(HapPack)
  if not ret['OK']:
    error = ret['Message']
    DIRAC.gLogger.error( error, HapPack)
    DIRAC.exit( -1 )

  hapEnviron = ret['Value']
  hessroot =  hapEnviron['HESSROOT']
  check_script = hessroot + '/hapscripts/dst/check_raw.csh'
  cmdTuple = [check_script]
  ret = systemCall( 0, cmdTuple, sendOutput)

  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute RAW Check step1')
    jobReport.setApplicationStatus('Check_raw: Failed')
    DIRAC.exit( -1 )

  status, stdout, stderr = ret['Value']
  if status==1:
    jobReport.setApplicationStatus('RAW Check step1: Big problem during RAW production')
    DIRAC.gLogger.error( 'Check_raw: Big problem during RAW production' )
    DIRAC.exit( -1 )

############## DST production #######################
  hr = HapRootMacro()
  hr.setSoftwarePackage(HapPack)

  infile = build_infile()
  infilestr = '"' + fileout + '"'
  telconfstr = '"' + telconf + '"'
  args = [str(int(run_number)), infilestr, telconfstr]
  
  try:
    args.extend([nevent])
  except NameError:
    DIRAC.gLogger.info( 'nevent arg not used' )
 
  DIRAC.gLogger.notice( 'make_CTA_DST macro Arguments:', args )
  hr.rootMacro = '/hapscripts/dst/make_CTA_DST.C+'
  hr.rootArguments = args
  DIRAC.gLogger.notice( 'Executing Hap make_CTA_DST macro' )
  res = hr.execute()

  if not res['OK']:
    DIRAC.gLogger.error( 'Failed to execute make_CTA_DST macro')
    jobReport.setApplicationStatus('Failure during make_CTA_DST')
    DIRAC.exit( -1 )

############ check existence of output file ####
  filedst = 'dst_CTA_%08d' % int(run_number) + '.root'

  if not os.path.isfile(filedst):
    DIRAC.gLogger.error('dst file not found:', filedst )
    jobReport.setApplicationStatus('make_CTA_DST.C: DST file not created')
    DIRAC.exit( -1 )

  fileout = 'dst_' + part_type + '_run' + run_number + '.root'
  cmd = 'mv ' + filedst + ' ' + fileout
  os.system(cmd)

#####################Check stdout ###########################
  DIRAC.gLogger.notice('Executing DST Check step0')
    
  check_script = hessroot + '/hapscripts/dst/check_dst0.csh'
  cmdTuple = [check_script]
  ret = systemCall( 0, cmdTuple, sendOutput)
       
  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute DST Check step0')
    jobReport.setApplicationStatus('Check_dst0: Failed')
    DIRAC.exit( -1 )

  status, stdout, stderr = ret['Value']
  if status==1:
    jobReport.setApplicationStatus('Check_dst0: Big problem during the DST production')
    DIRAC.gLogger.error( 'DST Check step0 reports: Big problem during the DST production' )
    DIRAC.exit( -1 )
  if status==2:
    jobReport.setApplicationStatus('Check_dst0: No triggered events')
    DIRAC.gLogger.notice( 'DST Check step0 reports: No triggered events' )
    DIRAC.exit( )

############# run the CheckDST macro #################
  DIRAC.gLogger.notice('Executing DST check step1')
  hr.rootMacro = '/hapscripts/dst/CheckDST.C+'
  fileoutstr = '"' + fileout + '"'
  args = [fileoutstr]
  DIRAC.gLogger.notice( 'CheckDST macro Arguments:', args )
  hr.rootArguments = args
  DIRAC.gLogger.notice( 'Executing Hap CheckDST macro')
  res = hr.execute()

  if not res['OK']:
    DIRAC.gLogger.error( 'Failure during DST Check step1' )
    jobReport.setApplicationStatus('Check_dst1: Failed')
    DIRAC.exit( -1 )

#######################Check stdout of CheckDST.C macro ##########################
  DIRAC.gLogger.notice('Executing DST Check step2')
  check_script = hessroot + '/hapscripts/dst/check_dst2.csh'
  cmdTuple = [check_script]
  ret = systemCall( 0, cmdTuple, sendOutput )
       
  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute DST Check step2')
    jobReport.setApplicationStatus('Check_dst2: Failed')
    DIRAC.exit( -1 )

  status, stdout, stderr = ret['Value']
  if status==1:
    jobReport.setApplicationStatus('DST Check step2: Big problem during the DST production')
    DIRAC.gLogger.error( 'DST Check step2 reports: Big problem during the DST production' )
    DIRAC.exit( -1 )
  if status==2:
    jobReport.setApplicationStatus('DST Check step2: No triggered events')
    DIRAC.gLogger.notice( 'DST Check step2 reports: No triggered events' )
    DIRAC.exit( )

  DIRAC.exit()
Ejemplo n.º 20
0
  if arguments.has_key('WorkingDirectory'):
    wdir = os.path.expandvars(arguments['WorkingDirectory'])
    if os.path.isdir(wdir):
      os.chdir(wdir)
    else:
      try:
        os.makedirs(wdir)
        if os.path.isdir(wdir):
          os.chdir(wdir)
      except Exception, x:
        gLogger.exception('JobWrapperTemplate could not create working directory')
        rescheduleFailedJob(jobID,'Could Not Create Working Directory')
        return 1

  #root = arguments['CE']['Root']
  jobReport = JobReport(jobID,'JobWrapper')

  try:
    job = JobWrapper( jobID, jobReport )
    job.initialize(arguments)
  except Exception, x:
    gLogger.exception('JobWrapper failed the initialization phase')
    rescheduleFailedJob(jobID,'Job Wrapper Initialization')
    job.sendWMSAccounting('Failed','Job Wrapper Initialization')
    return 1

  if arguments['Job'].has_key('InputSandbox'):
    jobReport.commit()
    try:
      result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
      if not result['OK']:
Ejemplo n.º 21
0
def main():

  from DIRAC.Core.Base import Script

  Script.registerSwitch( "p:", "run_number=", "Run Number", setRunNumber )
  Script.registerSwitch( "R:", "run=", "Run", setRun )
  Script.registerSwitch( "P:", "config_path=", "Config Path", setConfigPath )
  Script.registerSwitch( "T:", "template=", "Template", setTemplate )
  Script.registerSwitch( "E:", "executable=", "Executable", setExecutable )
  Script.registerSwitch( "V:", "version=", "Version", setVersion )
  Script.registerSwitch( "M:", "mode=", "Mode", setMode )
  
  Script.parseCommandLine( ignoreErrors = True )
  args = Script.getPositionalArgs()

  if len( args ) < 1:
    Script.showHelp()
  
  if version is None or executable is None or run_number is None or run is None or template is None:
    Script.showHelp()
    jobReport.setApplicationStatus('Options badly specified')
    DIRAC.exit( -1 )

  from CTADIRAC.Core.Workflow.Modules.CorsikaApp import CorsikaApp
  from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
  from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
  from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
  from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
  from DIRAC.Core.Utilities.Subprocess import systemCall
  from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

  jobID = os.environ['JOBID']
  jobID = int( jobID )
  jobReport = JobReport( jobID )

  CorsikaSimtelPack = 'corsika_simhessarray/' + version + '/corsika_simhessarray'

  packs = [CorsikaSimtelPack]
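  # The corsika_simhessarray package is taken from the shared area when
  # available; otherwise it is installed in the working area and built from
  # source for the requested version.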

  for package in packs:
    DIRAC.gLogger.notice( 'Checking:', package )
    if sharedArea:
      if checkSoftwarePackage( package, sharedArea() )['OK']:
        DIRAC.gLogger.notice( 'Package found in Shared Area:', package )
        installSoftwareEnviron( package, workingArea() )
        packageTuple =  package.split('/')
        corsika_subdir = sharedArea() + '/' + packageTuple[0] + '/' + version
        cmd = 'cp -r ' + corsika_subdir + '/* .'        
        os.system(cmd)
        continue
    if workingArea:
      if checkSoftwarePackage( package, workingArea() )['OK']:
        DIRAC.gLogger.notice( 'Package found in Local Area:', package )
        continue
      if installSoftwarePackage( package, workingArea() )['OK']:
      ############## compile #############################
        if version == 'clean_23012012':
          cmdTuple = ['./build_all','ultra','qgs2']
        elif version in ['prod-2_21122012','prod-2_08032013','prod-2_06052013']:
          cmdTuple = ['./build_all','prod2','qgs2']
        ret = systemCall( 0, cmdTuple, sendOutput)
        if not ret['OK']:
          DIRAC.gLogger.error( 'Failed to execute build')
          DIRAC.exit( -1 )
        continue

    DIRAC.gLogger.error( 'Check Failed for software package:', package )
    DIRAC.gLogger.error( 'Software package not available')
    DIRAC.exit( -1 )  


  cs = CorsikaApp()

  cs.setSoftwarePackage(CorsikaSimtelPack)

  cs.csExe = executable

  cs.csArguments = ['--run-number',run_number,'--run',run,template]

  corsikaReturnCode = cs.execute()
  
  if corsikaReturnCode != 0:
    DIRAC.gLogger.error( 'Failed to execute corsika Application')
    jobReport.setApplicationStatus('Corsika Application: Failed')
    DIRAC.exit( -1 )
    
###### rename corsika file #################################
  rundir = 'run' + run_number
  corsikaKEYWORDS = ['TELFIL']
  dictCorsikaKW = fileToKWDict(template,corsikaKEYWORDS)
  corsikafilename = rundir + '/' + dictCorsikaKW['TELFIL'][0]
  destcorsikafilename = 'corsika_run' + run_number + '.corsika.gz'
  cmd = 'mv ' + corsikafilename + ' ' + destcorsikafilename
  os.system(cmd)
 
  ### create corsika tar ####################
  corsika_tar = 'corsika_run' + run_number + '.tar.gz'
  filetar1 = rundir + '/'+'input'
  filetar2 = rundir + '/'+ 'DAT' + run_number + '.dbase'
  filetar3 = rundir + '/run' + str(int(run_number)) + '.log'
  cmdTuple = ['/bin/tar','zcf',corsika_tar, filetar1,filetar2,filetar3]
  DIRAC.gLogger.notice( 'Executing command tuple:', cmdTuple )
  ret = systemCall( 0, cmdTuple, sendOutput)
  if not ret['OK']:
    DIRAC.gLogger.error( 'Failed to execute tar')
    DIRAC.exit( -1 )
    
  DIRAC.exit()
Ejemplo n.º 22
0
def execute(arguments):
    """ The only real function executed here
  """

    global gJobReport

    jobID = arguments['Job'].get('JobID', 0)
    os.environ['JOBID'] = str(jobID)
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        'JobWrapperTemplate found that the working directory already exists'
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Working Directory already exists')
                else:
                    gLogger.exception(
                        'JobWrapperTemplate could not create working directory'
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult,
                                  'Job Wrapper Initialization')
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=exc)
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while downloading input sandbox',
                lException=exc)
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except JobWrapperError:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    'JobWrapper raised exception while resolving input data',
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute()
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError((result['Message'], result['Errno']))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
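        # DErrno.EWMSRESC signals that the job asked to be rescheduled rather
        # than being declared failed.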
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'JobWrapper execution',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
            return 1
        gLogger.exception('Job failed in execution phase')
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed',
                                'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase',
                          lException=exc)
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed',
                                'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1

    if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
        try:
            result = job.processJobOutputs()
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message',
                                       str(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while processing output files',
                lException=exc)
            gJobReport.setJobParameter('Error Message',
                                       str(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            'JobWrapper raised exception during the finalization phase',
            lException=exc)
        return 2
Ejemplo n.º 23
0
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception, x:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID,
                                    'Could Not Create Working Directory')
                return 1

    #root = arguments['CE']['Root']
    jobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, jobReport)
        job.initialize(arguments)
    except Exception, x:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleFailedJob(jobID, 'Job Wrapper Initialization')
        job.sendWMSAccounting('Failed', 'Job Wrapper Initialization')
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        jobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
Ejemplo n.º 24
0
def execute(arguments):
    """The only real function executed here"""

    global gJobReport

    jobID = arguments["Job"].get("JobID", 0)
    os.environ["JOBID"] = str(jobID)
    jobID = int(jobID)

    if "WorkingDirectory" in arguments:
        wdir = os.path.expandvars(arguments["WorkingDirectory"])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        "JobWrapperTemplate found that the working directory already exists"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Working Directory already exists")
                else:
                    gLogger.exception(
                        "JobWrapperTemplate could not create working directory"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Could Not Create Working Directory")
                return 1

    gJobReport = JobReport(jobID, "JobWrapper")

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("JobWrapper failed the initialization phase",
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(
            jobID=jobID,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION,
            jobReport=gJobReport)
        job.sendJobAccounting(
            status=rescheduleResult,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION)
        return 1

    if "InputSandbox" in arguments["Job"]:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments["Job"]["InputSandbox"])
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError:
            gLogger.exception("JobWrapper failed to download input sandbox")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while downloading input sandbox",
                lException=exc)
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
    else:
        gLogger.verbose("Job has no InputSandbox requirement")

    gJobReport.commit()

    if "InputData" in arguments["Job"]:
        if arguments["Job"]["InputData"]:
            try:
                result = job.resolveInputData()
                if not result["OK"]:
                    gLogger.warn(result["Message"])
                    raise JobWrapperError(result["Message"])
            except JobWrapperError:
                gLogger.exception("JobWrapper failed to resolve input data")
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    "JobWrapper raised exception while resolving input data",
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
        else:
            gLogger.verbose("Job has a null InputData requirement:")
            gLogger.verbose(arguments)
    else:
        gLogger.verbose("Job has no InputData requirement")

    gJobReport.commit()

    try:
        result = job.execute()
        if not result["OK"]:
            gLogger.error("Failed to execute job", result["Message"])
            raise JobWrapperError((result["Message"], result["Errno"]))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == "0":
            gLogger.verbose("JobWrapper exited with status=0 after execution")
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
            return 1
        gLogger.exception("Job failed in execution phase")
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("Job raised exception during execution phase",
                          lException=exc)
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1

    if "OutputSandbox" in arguments["Job"] or "OutputData" in arguments["Job"]:
        try:
            result = job.processJobOutputs()
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError as exc:
            gLogger.exception("JobWrapper failed to process output files")
            gJobReport.setJobParameter("Error Message",
                                       repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)

            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while processing output files",
                lException=exc)
            gJobReport.setJobParameter("Error Message",
                                       repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
    else:
        gLogger.verbose("Job has no OutputData or OutputSandbox requirement")

    try:
        # Failed jobs will return !=0 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            "JobWrapper raised exception during the finalization phase",
            lException=exc)
        return 2
Ejemplo n.º 25
0
def main():

    from DIRAC.Core.Base import Script

    Script.registerSwitch("p:", "inputfile=", "Input File", setInputFile)
    Script.registerSwitch("E:", "simtelExecName=", "SimtelExecName",
                          setExecutable)
    Script.registerSwitch("S:", "simtelConfig=", "SimtelConfig", setConfig)
    Script.registerSwitch("V:", "version=", "Version", setVersion)
    Script.registerSwitch("D:", "storage_element=", "Storage Element",
                          setStorageElement)

    from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
    from DIRAC.Resources.Catalog.FileCatalog import FileCatalog

    Script.parseCommandLine()
    DIRAC.gLogger.setLevel('INFO')

    global fcc, fcL, storage_element

    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = os.environ['JOBID']
    jobID = int(jobID)
    jobReport = JobReport(jobID)

    ###########
    ## Checking MD coherence
    fc = FileCatalog('LcgFileCatalog')
    res = fc._getCatalogConfigDetails('DIRACFileCatalog')
    print 'DFC CatalogConfigDetails:', res
    res = fc._getCatalogConfigDetails('LcgFileCatalog')
    print 'LCG CatalogConfigDetails:', res

    fcc = FileCatalogClient()
    fcL = FileCatalog('LcgFileCatalog')

    from DIRAC.Interfaces.API.Dirac import Dirac
    dirac = Dirac()
    ############################

    #############
    # simtelConfigFile should be built from ???
    #simtelConfigFilesPath = 'sim_telarray/multi'
    #simtelConfigFile = simtelConfigFilesPath + '/multi_cta-ultra5.cfg'
    #createGlobalsFromConfigFiles(simtelConfigFile)
    #createGlobalsFromConfigFiles(current_version)
    #######################
    ## files spread in 1000-runs subDirectories

    corsikaFileName = os.path.basename(corsikaFileLFN)
    run_number = corsikaFileName.split('run')[1].split('.corsika.gz')[
        0]  # run001412.corsika.gz

    runNum = int(run_number)
    subRunNumber = '%03d' % runNum
    runNumModMille = runNum % 1000
    runNumTrunc = (runNum - runNumModMille) / 1000
    runNumSeriesDir = '%03dxxx' % runNumTrunc
    print 'runNumSeriesDir=', runNumSeriesDir

    f = open('DISABLE_WATCHDOG_CPU_WALLCLOCK_CHECK', 'w')
    f.close()

    ##### If storage element is IN2P3-tape save simtel file on disk ###############
    if storage_element == 'CC-IN2P3-Tape':
        storage_element = 'CC-IN2P3-Disk'

############ Producing SimTel File
######################Building simtel Directory Metadata #######################

    cfg_dict = {
        "4MSST": 'cta-prod2-4m-dc',
        "SCSST": 'cta-prod2-sc-sst',
        "STD": 'cta-prod2',
        "NSBX3": 'cta-prod2',
        "ASTRI": 'cta-prod2-astri',
        "SCMST": 'cta-prod2-sc3'
    }
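    # cfg_dict maps each detector layout to its sim_telarray configuration;
    # '6INROW' and '5INROW' expand into several layouts, anything else is run
    # as a single configuration.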

    if simtelConfig == "6INROW":
        all_configs = ["SCMST", "4MSST", "SCSST", "ASTRI", "NSBX3", "STD"]
    elif simtelConfig == "5INROW":
        all_configs = ["4MSST", "SCSST", "ASTRI", "NSBX3", "STD"]
    else:
        all_configs = [simtelConfig]

    for current_conf in all_configs:

        DIRAC.gLogger.notice('current conf is', current_conf)

        if current_conf == "SCMST":
            current_version = version + '_sc3'
        else:
            current_version = version
            if os.path.isdir('sim_telarray'):
                DIRAC.gLogger.notice(
                    'Package found in the local area. Removing package...')
                cmd = 'rm -R sim_telarray corsika-6990 hessioxxx corsika-run'
                if (os.system(cmd)):
                    DIRAC.exit(-1)

        DIRAC.gLogger.notice('current version is', current_version)
        CorsikaSimtelPack = 'corsika_simhessarray/' + current_version + '/corsika_simhessarray'

        packs = [CorsikaSimtelPack]

        for package in packs:
            DIRAC.gLogger.notice('Checking:', package)
            if sharedArea:
                if checkSoftwarePackage(package, sharedArea())['OK']:
                    DIRAC.gLogger.notice('Package found in Shared Area:',
                                         package)
                    installSoftwareEnviron(package, workingArea())
                    packageTuple = package.split('/')
                    corsika_subdir = sharedArea() + '/' + packageTuple[0] + '/' + current_version
                    cmd = 'cp -u -r ' + corsika_subdir + '/* .'
                    os.system(cmd)
                    continue

            DIRAC.gLogger.error('Check Failed for software package:', package)
            DIRAC.gLogger.error('Software package not available')
            DIRAC.exit(-1)

        createGlobalsFromConfigFiles(current_version)

        resultCreateSimtelDirMD = createSimtelFileSystAndMD(
            current_conf, current_version)
        if not resultCreateSimtelDirMD['OK']:
            DIRAC.gLogger.error('Failed to create simtelArray Directory MD')
            jobReport.setApplicationStatus(
                'Failed to create simtelArray Directory MD')
            DIRAC.gLogger.error(
                'Metadata coherence problem, no simtelArray File produced')
            DIRAC.exit(-1)
        else:
            print 'simtel Directory MD successfully created'

############## introduce file existence check here ########################
        simtelFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(
                obslev) + '_' + 'run' + run_number + '.simtel.gz'
        simtelDirPath_conf = simtelDirPath + '_' + current_conf
        simtelOutFileDir = os.path.join(simtelDirPath_conf, 'Data',
                                        runNumSeriesDir)
        simtelOutFileLFN = os.path.join(simtelOutFileDir, simtelFileName)

        res = CheckCatalogCoherence(simtelOutFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('Current conf already done', current_conf)
            continue

#### execute simtelarray ################
        fd = open('run_sim.sh', 'w')
        fd.write("""#! /bin/sh  
  export SVNPROD2=$PWD
  export SVNTAG=SVN-PROD2_rev1869
  export CORSIKA_IO_BUFFER=800MB
  ./grid_prod2-repro.sh %s %s""" % (corsikaFileName, current_conf))
        fd.close()

        os.system('chmod u+x run_sim.sh')

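        # run_sim.sh wraps grid_prod2-repro.sh for the current configuration;
        # systemCall streams its output through the sendOutputSimTel callback.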
        cmdTuple = ['./run_sim.sh']
        ret = systemCall(0, cmdTuple, sendOutputSimTel)
        simtelReturnCode, stdout, stderr = ret['Value']

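        # grep exits with 0 when 'Broken' is found in simtel.log, so a non-zero
        # os.system() status here means no broken pipe was detected.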
        if (os.system('grep Broken simtel.log')):
            DIRAC.gLogger.notice('not broken')
        else:
            DIRAC.gLogger.notice('broken')

            # Tag corsika File if Broken Pipe
            corsikaTagMD = {}
            corsikaTagMD['CorsikaToReprocess'] = 'CorsikaToReprocess'
            result = fcc.setMetadata(corsikaFileLFN, corsikaTagMD)
            print "result setMetadata=", result
            if not result['OK']:
                print 'ResultSetMetadata:', result['Message']

            jobReport.setApplicationStatus('Broken pipe')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Failed to execute run_sim.sh')
            DIRAC.gLogger.error('run_sim.sh status is:', simtelReturnCode)
            DIRAC.exit(-1)

## putAndRegister simtel data/log/histo Output File:
        cfg = cfg_dict[current_conf]
        cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Data/*.simtel.gz ' + simtelFileName
        if (os.system(cmd)):
            DIRAC.exit(-1)

############################################
        simtelRunNumberSeriesDirExist = fcc.isDirectory(
            simtelOutFileDir)['Value']['Successful'][simtelOutFileDir]
        newSimtelRunFileSeriesDir = (simtelRunNumberSeriesDirExist != True)  # if new runFileSeries, will need to add new MD

        simtelLogFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(obslev) + '_' + 'run' + run_number + '.log.gz'
        cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Log/*.log.gz ' + simtelLogFileName
        if (os.system(cmd)):
            DIRAC.exit(-1)
        simtelOutLogFileDir = os.path.join(simtelDirPath_conf, 'Log',
                                           runNumSeriesDir)
        simtelOutLogFileLFN = os.path.join(simtelOutLogFileDir,
                                           simtelLogFileName)

        simtelHistFileName = particle + '_' + str(thetaP) + '_' + str(
            phiP) + '_alt' + str(
                obslev) + '_' + 'run' + run_number + '.hdata.gz'
        cmd = 'mv Data/sim_telarray/' + cfg + '/0.0deg/Histograms/*.hdata.gz ' + simtelHistFileName
        if (os.system(cmd)):
            DIRAC.exit(-1)
        simtelOutHistFileDir = os.path.join(simtelDirPath_conf, 'Histograms',
                                            runNumSeriesDir)
        simtelOutHistFileLFN = os.path.join(simtelOutHistFileDir,
                                            simtelHistFileName)

        ################################################
        DIRAC.gLogger.notice('Put and register simtel File in LFC and DFC:',
                             simtelOutFileLFN)
        ret = dirac.addFile(simtelOutFileLFN, simtelFileName, storage_element)

        res = CheckCatalogCoherence(simtelOutFileLFN)
        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Error during addFile call:', ret['Message'])
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)
######################################################################

        res = CheckCatalogCoherence(simtelOutLogFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('Log file already exists. Removing:',
                                 simtelOutLogFileLFN)
            ret = dirac.removeFile(simtelOutLogFileLFN)

        DIRAC.gLogger.notice(
            'Put and register simtel Log File in LFC and DFC:',
            simtelOutLogFileLFN)
        ret = dirac.addFile(simtelOutLogFileLFN, simtelLogFileName,
                            storage_element)

        res = CheckCatalogCoherence(simtelOutLogFileLFN)
        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Error during addFile call:', ret['Message'])
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)
######################################################################

        res = CheckCatalogCoherence(simtelOutHistFileLFN)
        if res == DIRAC.S_OK:
            DIRAC.gLogger.notice('Histo file already exists. Removing:',
                                 simtelOutHistFileLFN)
            ret = dirac.removeFile(simtelOutHistFileLFN)

        DIRAC.gLogger.notice(
            'Put and register simtel Histo File in LFC and DFC:',
            simtelOutHistFileLFN)
        ret = dirac.addFile(simtelOutHistFileLFN, simtelHistFileName,
                            storage_element)

        res = CheckCatalogCoherence(simtelOutHistFileLFN)
        if res != DIRAC.S_OK:
            DIRAC.gLogger.error('Job failed: Catalog Coherence problem found')
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)

        if not ret['OK']:
            DIRAC.gLogger.error('Error during addFile call:', ret['Message'])
            jobReport.setApplicationStatus('OutputData Upload Error')
            DIRAC.exit(-1)
######################################################################

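        # Run-series-level metadata is added only the first time this runNumSeriesDir
        # appears in the catalogue.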
        if newSimtelRunFileSeriesDir:
            insertRunFileSeriesMD(simtelOutFileDir, runNumTrunc)
            insertRunFileSeriesMD(simtelOutLogFileDir, runNumTrunc)
            insertRunFileSeriesMD(simtelOutHistFileDir, runNumTrunc)


###### simtel File level metadata ############################################
        simtelFileMD = {}
        simtelFileMD['runNumber'] = int(run_number)
        simtelFileMD['jobID'] = jobID
        simtelFileMD['simtelReturnCode'] = simtelReturnCode

        result = fcc.setMetadata(simtelOutFileLFN, simtelFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        result = fcc.setMetadata(simtelOutLogFileLFN, simtelFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

        result = fcc.setMetadata(simtelOutHistFileLFN, simtelFileMD)
        print "result setMetadata=", result
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

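        # Declare the corsika input as ancestor of each simtel output so the provenance
        # can be traced back in the file catalogue.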
        result = fcc.addFileAncestors(
            {simtelOutFileLFN: {
                'Ancestors': [corsikaFileLFN]
            }})
        print 'result addFileAncestor:', result

        result = fcc.addFileAncestors(
            {simtelOutLogFileLFN: {
                'Ancestors': [corsikaFileLFN]
            }})
        print 'result addFileAncestor:', result

        result = fcc.addFileAncestors(
            {simtelOutHistFileLFN: {
                'Ancestors': [corsikaFileLFN]
            }})
        print 'result addFileAncestor:', result

        result = fcc.setMetadata(simtelOutFileLFN, simtelFileMD)
        if not result['OK']:
            print 'ResultSetMetadata:', result['Message']

    DIRAC.exit()
Ejemplo n.º 26
0
'''
Created on 2015-05-19 21:45:37

@author: suo
'''
import sys
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
from DIRAC.Core.Base import Script
Script.parseCommandLine( ignoreErrors = False )

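# Expected command-line arguments: jobID, experiment (used as the report source) and
# the application status message to set.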
jobID = sys.argv[1]
experiment = sys.argv[2]
message = sys.argv[3]

jobReport = JobReport(int(jobID), experiment)
result = jobReport.setApplicationStatus(message)
if not result['OK']:
    try:
        with open('job.err','a') as errFile:
            print >> errFile, 'setApplicationStatus error: %s' % result
    except IOError as e:
        print 'IOError:', str(e)
Ejemplo n.º 27
0
def execute(arguments):

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)
    # Fix in the environment to get a reasonable performance from dCache,
    # until we move to a new version of root
    #  os.environ['DCACHE_RAHEAD'] = str(1)
    #  os.environ['DCACHE_RA_BUFFER'] = str(50*1024)

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Could Not Create Working Directory')
                return 1

    #root = arguments['CE']['Root']
    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        job.sendWMSAccounting(rescheduleResult, 'Job Wrapper Initialization')
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendWMSAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception, x:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendWMSAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
Ejemplo n.º 28
0
def execute ( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )
  # Fix in the environment to get a reasonable performance from dCache,
  # until we move to a new version of root
#  os.environ['DCACHE_RAHEAD'] = str(1)
#  os.environ['DCACHE_RA_BUFFER'] = str(50*1024)

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  #root = arguments['CE']['Root']
  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the initialization phase' )
    rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    job.sendWMSAccounting( 'Failed', 'Job Wrapper Initialization' )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleFailedJob( jobID, 'Input Sandbox Download' )
      job.sendWMSAccounting( 'Failed', 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception, x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleFailedJob( jobID, 'Input Data Resolution' )
        job.sendWMSAccounting( 'Failed', 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
Ejemplo n.º 29
0
def main():

    from DIRAC.Core.Base import Script

    #### eventio_cta options ##########################################
    Script.registerSwitch("I:", "infile=", "Input file", setInfile)
    Script.registerSwitch("O:", "outfile=", "Output file", setOutfile)
    Script.registerSwitch("T:", "tellist=", "Tellist", setTellist)
    Script.registerSwitch("F:", "Nfirst_mcevt=", "Nfirst_mcevt",
                          setNfirst_mcevt)
    Script.registerSwitch("L:", "Nlast_mcevt=", "Nlast_mcevt", setNlast_mcevt)
    Script.registerSwitch("P:", "pixelslices=", "setPixelslices (true/false)",
                          setPixelslices)
    ### other options ###############################################
    Script.registerSwitch("V:", "version=", "HAP version", setVersion)

    Script.parseCommandLine(ignoreErrors=True)

    args = Script.getPositionalArgs()

    if len(args) < 1:
        Script.showHelp()

    if outfile is None or infile is None or tellist is None or version is None:
        Script.showHelp()
        jobReport.setApplicationStatus('Options badly specified')
        DIRAC.exit(-1)

    from CTADIRAC.Core.Workflow.Modules.HapApplication import HapApplication
    from CTADIRAC.Core.Workflow.Modules.HapRootMacro import HapRootMacro
    from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
    from CTADIRAC.Core.Utilities.SoftwareInstallation import getSoftwareEnviron
    from CTADIRAC.Core.Utilities.SoftwareInstallation import localArea
    from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
    from DIRAC.Core.Utilities.Subprocess import systemCall
    from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

    jobID = os.environ['JOBID']
    jobID = int(jobID)
    jobReport = JobReport(jobID)

    HapPack = 'HAP/' + version + '/HAP'

    packs = ['HESS/v0.2/lib', 'HESS/v0.3/root', HapPack]

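    # Look for each required package in the shared software area first, then in the
    # local area, installing it locally as a last resort.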
    for package in packs:
        DIRAC.gLogger.notice('Checking:', package)
        if sharedArea:
            if checkSoftwarePackage(package, sharedArea())['OK']:
                DIRAC.gLogger.notice('Package found in Shared Area:', package)
                continue
        if localArea:
            if checkSoftwarePackage(package, localArea())['OK']:
                DIRAC.gLogger.notice('Package found in Local Area:', package)
                continue
            if installSoftwarePackage(package, localArea())['OK']:
                continue
        DIRAC.gLogger.error('Check Failed for software package:', package)
        DIRAC.gLogger.error('Software package not available')
        DIRAC.exit(-1)

    telconf = os.path.join(localArea(),
                           'HAP/%s/config/%s' % (version, tellist))

    ha = HapApplication()
    ha.setSoftwarePackage(HapPack)
    ha.hapExecutable = 'eventio_cta'
    ha.hapArguments = ['-file', infile, '-o', outfile, '-tellist', telconf]

    try:
        ha.hapArguments.extend(
            ['-Nfirst_mcevt', Nfirst_mcevt, '-Nlast_mcevt', Nlast_mcevt])
    except NameError:
        DIRAC.gLogger.info('Nfirst_mcevt/Nlast_mcevt options are not used')

    try:
        if (pixelslices == 'true'):
            ha.hapArguments.extend(['-pixelslices'])
    except NameError:
        DIRAC.gLogger.info('pixelslices option is not used')

    DIRAC.gLogger.notice('Executing Hap Converter Application')
    res = ha.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Failed to execute eventio_cta Application')
        jobReport.setApplicationStatus('eventio_cta: Failed')
        DIRAC.exit(-1)

    if not os.path.isfile(outfile):
        error = 'raw file was not created:'
        DIRAC.gLogger.error(error, outfile)
        jobReport.setApplicationStatus('eventio_cta: RawData not created')
        DIRAC.exit(-1)

###################### Check RAW DATA #######################
    hr = HapRootMacro()
    hr.setSoftwarePackage(HapPack)

    DIRAC.gLogger.notice('Executing RAW check step0')
    hr.rootMacro = '/hapscripts/dst/Open_Raw.C+'
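    # The output file name is wrapped in double quotes so it is passed to the ROOT
    # macro as a C string literal.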
    outfilestr = '"' + outfile + '"'
    args = [outfilestr]
    DIRAC.gLogger.notice('Open_Raw macro Arguments:', args)
    hr.rootArguments = args
    DIRAC.gLogger.notice('Executing Hap Open_Raw macro')
    res = hr.execute()

    if not res['OK']:
        DIRAC.gLogger.error('Open_Raw: Failed')
        DIRAC.exit(-1)


#########################Quality Check for raw Output File: step1####################
    DIRAC.gLogger.notice('Executing Raw Check step1')

    ret = getSoftwareEnviron(HapPack)
    if not ret['OK']:
        error = ret['Message']
        DIRAC.gLogger.error(error, HapPack)
        DIRAC.exit(-1)

    hapEnviron = ret['Value']
    hessroot = hapEnviron['HESSROOT']
    check_script = hessroot + '/hapscripts/dst/check_raw.csh'
    cmdTuple = [check_script]
    ret = systemCall(0, cmdTuple, sendOutput)

    if not ret['OK']:
        DIRAC.gLogger.error('Failed to execute RAW Check step1')
        jobReport.setApplicationStatus('Check_raw: Failed')
        DIRAC.exit(-1)

    status, stdout, stderr = ret['Value']
    if status == 1:
        jobReport.setApplicationStatus(
            'RAW Check step1: Big problem during RAW production')
        DIRAC.gLogger.error('Check_raw: Big problem during RAW production')
        DIRAC.exit(-1)

    DIRAC.exit()
Ejemplo n.º 30
0
def execute ( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the initialization phase' )
    rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
      job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception, x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
        job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
Ejemplo n.º 31
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            #Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])
            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

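        # Refresh the StopAfterFailedMatches option each cycle, using the current value
        # as the default, so it can be tuned while the agent is running.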
        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No work available', jobRequest['Message']):
                self.log.info('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return S_ERROR('Nothing to do')
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error(jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return S_ERROR('Nothing to do')
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                self.log.error(jobRequest['Message'])
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.info('Failed to get jobs: %s' %
                              (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return S_ERROR('Nothing to do')
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        jobID = matcherInfo['JobID']
        self.pilotInfoReportedFlag = matcherInfo.get('PilotInfoReportedFlag',
                                                     False)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has no JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('SystemConfig'):
            self.log.warn(
                'Job has no system configuration defined in JDL parameters')
            systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
            self.log.info(
                'Setting system config to /LocalSite/Architecture = %s since it was not specified'
                % systemConfig)
            if not systemConfig:
                self.log.warn('/LocalSite/Architecture is not defined')
            params['SystemConfig'] = systemConfig
        else:
            systemConfig = params['SystemConfig']
            if systemConfig.lower() == 'any':
                systemConfig = gConfig.getValue('/LocalSite/Architecture', '')
                self.log.info(
                    'Setting SystemConfig = /LocalSite/Architecture =',
                    '"%s" since it was set to "ANY" in the job description' %
                    systemConfig)
                if not systemConfig:
                    self.log.warn('/LocalSite/Architecture is not defined')
                params['SystemConfig'] = systemConfig

        if not params.has_key('MaxCPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s, SystemConfig=%s' %
                      (jobID, jobType, systemConfig))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)
            if self.gridCEQueue:
                jobReport.setJobParameter('GridCEQueue',
                                          self.gridCEQueue,
                                          sendFlag=False)
            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            # self.__setJobSite( jobID, self.siteName )
            if not self.pilotInfoReportedFlag:
                self.__reportPilotInfo(jobID)
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Is this necessary at all?
            saveJDL = self.__saveJobJDLRequest(jobID, jobJDL)
            #self.__report(jobID,'Matched','Job Prepared to Submit')

            #resourceParameters = self.__getJDLParameters( resourceJDL )
            #if not resourceParameters['OK']:
            #  return resourceParameters
            #resourceParams = resourceParameters['Value']

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job %s' %
                               (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.verbose('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, jobJDL, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                return self.__finish(
                    'Payload execution failed with error code %s' %
                    submission['PayloadFailed'], self.stopOnApplicationFailure)

            self.log.verbose('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        result = self.timeLeftUtil.getTimeLeft(0.0)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined, use the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Ejemplo n.º 32
0
def execute(arguments):
  """ The only real function executed here
  """

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int(jobID)

  if 'WorkingDirectory' in arguments:
    wdir = os.path.expandvars(arguments['WorkingDirectory'])
    if os.path.isdir(wdir):
      os.chdir(wdir)
    else:
      try:
        os.makedirs(wdir)  # this will raise an exception if wdir already exists (which is ~OK)
        if os.path.isdir(wdir):
          os.chdir(wdir)
      except OSError as osError:
        if osError.errno == errno.EEXIST and os.path.isdir(wdir):
          gLogger.exception('JobWrapperTemplate found that the working directory already exists')
          rescheduleResult = rescheduleFailedJob(jobID, 'Working Directory already exists')
        else:
          gLogger.exception('JobWrapperTemplate could not create working directory')
          rescheduleResult = rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
        return 1

  gJobReport = JobReport(jobID, 'JobWrapper')

  try:
    job = JobWrapper(jobID, gJobReport)
    job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('JobWrapper failed the initialization phase', lException=exc)
    rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
    try:
      job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper failed sending job accounting', lException=exc)
    return 1

  if 'InputSandbox' in arguments['Job']:
    gJobReport.commit()
    try:
      result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except JobWrapperError:
      gLogger.exception('JobWrapper failed to download input sandbox')
      rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
      return 1
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper raised exception while downloading input sandbox', lException=exc)
      rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
      return 1
  else:
    gLogger.verbose('Job has no InputSandbox requirement')

  gJobReport.commit()

  if 'InputData' in arguments['Job']:
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn(result['Message'])
          raise JobWrapperError(result['Message'])
      except JobWrapperError:
        gLogger.exception('JobWrapper failed to resolve input data')
        rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
        return 1
      except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper raised exception while resolving input data', lException=exc)
        rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
        return 1
    else:
      gLogger.verbose('Job has a null InputData requirement:')
      gLogger.verbose(arguments)
  else:
    gLogger.verbose('Job has no InputData requirement')

  gJobReport.commit()

  try:
    result = job.execute(arguments)
    if not result['OK']:
      gLogger.error('Failed to execute job', result['Message'])
      raise JobWrapperError((result['Message'], result['Errno']))
  except JobWrapperError as exc:
    if exc.value[1] == 0 or str(exc.value[0]) == '0':
      gLogger.verbose('JobWrapper exited with status=0 after execution')
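    # DErrno.EWMSRESC is presumably the DErrno code by which the payload signals that
    # the job should be rescheduled by the WMS.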
    if exc.value[1] == DErrno.EWMSRESC:
      gLogger.warn("Asked to reschedule job")
      rescheduleResult = rescheduleFailedJob(jobID, 'JobWrapper execution', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
      return 1
    gLogger.exception('Job failed in execution phase')
    gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    gJobReport.setJobStatus(
        'Failed', 'Exception During Execution', sendFlag=False)
    job.sendFailoverRequest('Failed', 'Exception During Execution')
    return 1
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('Job raised exception during execution phase', lException=exc)
    gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    gJobReport.setJobStatus('Failed', 'Exception During Execution', sendFlag=False)
    job.sendFailoverRequest('Failed', 'Exception During Execution')
    return 1

  if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
    try:
      result = job.processJobOutputs()
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except JobWrapperError as exc:
      gLogger.exception('JobWrapper failed to process output files')
      gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
      gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
      job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
      return 2
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper raised exception while processing output files', lException=exc)
      gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
      gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
      job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
      return 2
  else:
    gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize()
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('JobWrapper raised exception during the finalization phase', lException=exc)
    return 2
Ejemplo n.º 33
0
    def execute(self):
        """The JobAgent execution method.
    """
        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                if self.timeLeftError:
                    self.log.warn(self.timeLeftError)
                    return self.__finish(self.timeLeftError)
                self.log.info('%s normalized CPU units remaining in slot' %
                              (self.timeLeft))
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        self.log.verbose('Job Agent execution loop')
        available = self.computingElement.available()
        if not available['OK'] or not available['Value']:
            self.log.info('Resource is not available')
            self.log.info(available['Message'])
            return self.__finish('CE Not Available')

        self.log.info(available['Message'])

        result = self.computingElement.getDescription()
        if not result['OK']:
            return result
        ceDict = result['Value']

        # Add pilot information
        gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
        if gridCE != 'Unknown':
            ceDict['GridCE'] = gridCE
        if not 'PilotReference' in ceDict:
            ceDict['PilotReference'] = str(self.pilotReference)
        ceDict['PilotBenchmark'] = self.cpuFactor
        ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

        # Add possible job requirements
        result = gConfig.getOptionsDict('/AgentJobRequirements')
        if result['OK']:
            requirementsDict = result['Value']
            ceDict.update(requirementsDict)

        self.log.verbose(ceDict)
        start = time.time()
        jobRequest = self.__requestJob(ceDict)
        matchTime = time.time() - start
        self.log.info('MatcherTime = %.2f (s)' % (matchTime))

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:
            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK: %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs: %s' %
                                (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if not matcherInfo.has_key(param):
                self.__report(jobID, 'Failed',
                              'Matcher did not return %s' % (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                self.__report(jobID, 'Failed',
                              'Matcher returned null %s' % (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned %s = %s ' %
                                 (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo.keys():
            if not key in matcherParams:
                value = matcherInfo[key]
                optimizerParams[key] = value

        parameters = self.__getJDLParameters(jobJDL)
        if not parameters['OK']:
            self.__report(jobID, 'Failed', 'Could Not Extract JDL Parameters')
            self.log.warn(parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if not params.has_key('JobID'):
            msg = 'Job has no JobID defined in JDL parameters'
            self.__report(jobID, 'Failed', msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if not params.has_key('JobType'):
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if not params.has_key('CPUTime'):
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        if self.extraOptions:
            params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n %s' %
                         (jobRequest['Value']))
        self.log.info('Received JobID=%s, JobType=%s' % (jobID, jobType))
        self.log.info('OwnerDN: %s JobGroup: %s' % (ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
            jobReport.setJobParameter('MatcherServiceTime',
                                      str(matchTime),
                                      sendFlag=False)

            if os.environ.has_key('BOINC_JOB_ID'):
                # Report BOINC environment
                for p in [
                        'BoincUserID', 'BoincHostID', 'BoincHostPlatform',
                        'BoincHostName'
                ]:
                    jobReport.setJobParameter(p,
                                              gConfig.getValue(
                                                  '/LocalSite/%s' % p,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus('Matched', 'Job Received by Agent')
            result = self.__setupProxy(ownerDN, jobGroup)
            if not result['OK']:
                return self.__rescheduleFailedJob(
                    jobID, result['Message'], self.stopOnApplicationFailure)
            if 'Value' in result and result['Value']:
                proxyChain = result['Value']

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self.__checkInstallSoftware(jobID, params, ceDict)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self.__rescheduleFailedJob(
                    jobID, errorMsg, self.stopOnApplicationFailure)

            self.log.debug('Before %sCE submitJob()' % (self.ceName))
            submission = self.__submitJob(jobID, params, ceDict,
                                          optimizerParams, proxyChain)
            if not submission['OK']:
                self.__report(jobID, 'Failed', submission['Message'])
                return self.__finish(submission['Message'])
            elif 'PayloadFailed' in submission:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % submission[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception:
            self.log.exception()
            return self.__rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

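        # os.times() gives (utime, stime, cutime, cstime, elapsed); subtracting the
        # values recorded at initialisation yields the CPU time consumed so far.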
        currentTimes = list(os.times())
        for i in range(len(currentTimes)):
            currentTimes[i] -= self.initTimes[i]

        utime, stime, cutime, cstime, _elapsed = currentTimes
        cpuTime = utime + stime + cutime + cstime

        result = self.timeLeftUtil.getTimeLeft(cpuTime)
        if result['OK']:
            self.timeLeft = result['Value']
        else:
            if result['Message'] != 'Current batch system is not supported':
                self.timeLeftError = result['Message']
            else:
                if self.cpuFactor:
                    # if the batch system is not defined, use the CPUNormalizationFactor
                    # defined locally
                    self.timeLeft = self.__getCPUTimeLeft()
        scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

        self.__setJobParam(jobID, 'ScaledCPUTime',
                           str(scaledCPUTime - self.scaledCPUTime))
        self.scaledCPUTime = scaledCPUTime

        return S_OK('Job Agent cycle complete')
Ejemplo n.º 34
0
  def execute( self ):
    """The JobAgent execution method.
    """
    if self.jobCount:
      #Only call timeLeft utility after a job has been picked up
      self.log.info( 'Attempting to check CPU time left for filling mode' )
      if self.fillingMode:
        if self.timeLeftError:
          self.log.warn( self.timeLeftError )
          return self.__finish( self.timeLeftError )
        self.log.info( '%s normalized CPU units remaining in slot' % ( self.timeLeft ) )
        # Need to update the Configuration so that the new value is published in the next matching request
        result = self.computingElement.setCPUTimeLeft( cpuTimeLeft = self.timeLeft )
        if not result['OK']:
          return self.__finish( result['Message'] )
        
        # Update local configuration to be used by submitted job wrappers
        localCfg = CFG()
        if self.extraOptions:
          localConfigFile = os.path.join( '.', self.extraOptions )
        else:
          localConfigFile = os.path.join( rootPath, "etc", "dirac.cfg" )
        localCfg.loadFromFile( localConfigFile )
        if not localCfg.isSection('/LocalSite'):
          localCfg.createNewSection('/LocalSite')
        localCfg.setOption( '/LocalSite/CPUTimeLeft', self.timeLeft )
        localCfg.writeToFile( localConfigFile )
        
      else:
        return self.__finish( 'Filling Mode is Disabled' )

    self.log.verbose( 'Job Agent execution loop' )
    available = self.computingElement.available()
    if not available['OK'] or not available['Value']:
      self.log.info( 'Resource is not available' )
      self.log.info( available['Message'] )
      return self.__finish( 'CE Not Available' )

    self.log.info( available['Message'] )

    result = self.computingElement.getDescription()
    if not result['OK']:
      return result
    ceDict = result['Value']

    # Add pilot information
    gridCE = gConfig.getValue( 'LocalSite/GridCE', 'Unknown' )
    if gridCE != 'Unknown':
      ceDict['GridCE'] = gridCE
    if not 'PilotReference' in ceDict:
      ceDict['PilotReference'] = str( self.pilotReference )
    ceDict['PilotBenchmark'] = self.cpuFactor
    ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

    # Add possible job requirements
    result = gConfig.getOptionsDict( '/AgentJobRequirements' )
    if result['OK']:
      requirementsDict = result['Value']
      ceDict.update( requirementsDict )

    self.log.verbose( ceDict )
    start = time.time()
    jobRequest = self.__requestJob( ceDict )
    matchTime = time.time() - start
    self.log.info( 'MatcherTime = %.2f (s)' % ( matchTime ) )

    self.stopAfterFailedMatches = self.am_getOption( 'StopAfterFailedMatches', self.stopAfterFailedMatches )

    if not jobRequest['OK']:
      if re.search( 'No match found', jobRequest['Message'] ):
        self.log.notice( 'Job request OK: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "seconds timeout" ) != -1:
        self.log.error( jobRequest['Message'] )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )
      elif jobRequest['Message'].find( "Pilot version does not match" ) != -1 :
        self.log.error( jobRequest['Message'] )
        return S_ERROR( jobRequest['Message'] )
      else:
        self.log.notice( 'Failed to get jobs: %s' % ( jobRequest['Message'] ) )
        self.matchFailedCount += 1
        if self.matchFailedCount > self.stopAfterFailedMatches:
          return self.__finish( 'Nothing to do for more than %d cycles' % self.stopAfterFailedMatches )
        return S_OK( jobRequest['Message'] )

    # Reset the Counter
    self.matchFailedCount = 0

    matcherInfo = jobRequest['Value']
    jobID = matcherInfo['JobID']
    if not self.pilotInfoReportedFlag:
      # Check the flag after the first access to the Matcher
      self.pilotInfoReportedFlag = matcherInfo.get( 'PilotInfoReportedFlag', False )
    matcherParams = ['JDL', 'DN', 'Group']
    for param in matcherParams:
      if not matcherInfo.has_key( param ):
        self.__report( jobID, 'Failed', 'Matcher did not return %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      elif not matcherInfo[param]:
        self.__report( jobID, 'Failed', 'Matcher returned null %s' % ( param ) )
        return self.__finish( 'Matcher Failed' )
      else:
        self.log.verbose( 'Matcher returned %s = %s ' % ( param, matcherInfo[param] ) )

    jobJDL = matcherInfo['JDL']
    jobGroup = matcherInfo['Group']
    ownerDN = matcherInfo['DN']

    optimizerParams = {}
    for key in matcherInfo.keys():
      if not key in matcherParams:
        value = matcherInfo[key]
        optimizerParams[key] = value

    parameters = self.__getJDLParameters( jobJDL )
    if not parameters['OK']:
      self.__report( jobID, 'Failed', 'Could Not Extract JDL Parameters' )
      self.log.warn( parameters['Message'] )
      return self.__finish( 'JDL Problem' )

    params = parameters['Value']
    if not params.has_key( 'JobID' ):
      msg = 'Job has no JobID defined in JDL parameters'
      self.__report( jobID, 'Failed', msg )
      self.log.warn( msg )
      return self.__finish( 'JDL Problem' )
    else:
      jobID = params['JobID']

    if not params.has_key( 'JobType' ):
      self.log.warn( 'Job has no JobType defined in JDL parameters' )
      jobType = 'Unknown'
    else:
      jobType = params['JobType']

    if not params.has_key( 'CPUTime' ):
      self.log.warn( 'Job has no CPU requirement defined in JDL parameters' )

    if self.extraOptions:
      params['Arguments'] = params['Arguments'] + ' ' + self.extraOptions
      params['ExtraOptions'] = self.extraOptions

    self.log.verbose( 'Job request successful: \n %s' % ( jobRequest['Value'] ) )
    self.log.info( 'Received JobID=%s, JobType=%s' % ( jobID, jobType ) )
    self.log.info( 'OwnerDN: %s JobGroup: %s' % ( ownerDN, jobGroup ) )
    self.jobCount += 1
    try:
      jobReport = JobReport( jobID, 'JobAgent@%s' % self.siteName )
      jobReport.setJobParameter( 'MatcherServiceTime', str( matchTime ), sendFlag = False )

      if os.environ.has_key( 'BOINC_JOB_ID' ):
        # Report BOINC environment 
        for p in ['BoincUserID', 'BoincHostID', 'BoincHostPlatform', 'BoincHostName']:
          jobReport.setJobParameter( p, gConfig.getValue( '/LocalSite/%s' % p, 'Unknown' ), sendFlag = False )

      jobReport.setJobStatus( 'Matched', 'Job Received by Agent' )
      result = self.__setupProxy( ownerDN, jobGroup )
      if not result[ 'OK' ]:
        return self.__rescheduleFailedJob( jobID, result[ 'Message' ], self.stopOnApplicationFailure )
      # Keep the proxy chain returned by __setupProxy for the payload submission below
      proxyChain = result.get( 'Value' )

      # Save the job jdl for external monitoring
      self.__saveJobJDLRequest( jobID, jobJDL )

      software = self.__checkInstallSoftware( jobID, params, ceDict )
      if not software['OK']:
        self.log.error( 'Failed to install software for job %s' % ( jobID ) )
        errorMsg = software['Message']
        if not errorMsg:
          errorMsg = 'Failed software installation'
        return self.__rescheduleFailedJob( jobID, errorMsg, self.stopOnApplicationFailure )

      self.log.debug( 'Before %sCE submitJob()' % ( self.ceName ) )
      submission = self.__submitJob( jobID, params, ceDict, optimizerParams, proxyChain )
      if not submission['OK']:
        self.__report( jobID, 'Failed', submission['Message'] )
        return self.__finish( submission['Message'] )
      elif 'PayloadFailed' in submission:
        # Do not keep running and do not overwrite the Payload error
        return self.__finish( 'Payload execution failed with error code %s' % submission['PayloadFailed'],
                              self.stopOnApplicationFailure )

      self.log.debug( 'After %sCE submitJob()' % ( self.ceName ) )
    except Exception:
      self.log.exception()
      return self.__rescheduleFailedJob( jobID , 'Job processing failed with exception', self.stopOnApplicationFailure )

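    # os.times() returns (user, system, children user, children system, elapsed);
    # subtracting the values stored in self.initTimes (presumably taken at agent
    # start-up) gives the CPU time consumed so far by the agent and its children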
    currentTimes = list( os.times() )
    for i in range( len( currentTimes ) ):
      currentTimes[i] -= self.initTimes[i]

    utime, stime, cutime, cstime, _elapsed = currentTimes
    cpuTime = utime + stime + cutime + cstime

    result = self.timeLeftUtil.getTimeLeft( cpuTime )
    if result['OK']:
      self.timeLeft = result['Value']
    else:
      if result['Message'] != 'Current batch system is not supported':
        self.timeLeftError = result['Message']
      else:
        if self.cpuFactor:
          # If the batch system is not supported, fall back to the
          # CPUNormalizationFactor defined locally
          self.timeLeft = self.__getCPUTimeLeft()
    scaledCPUTime = self.timeLeftUtil.getScaledCPU()['Value']

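    # Report only the scaled CPU consumed by this job: the difference between the
    # current accumulated value and the one stored after the previous cycle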
    self.__setJobParam( jobID, 'ScaledCPUTime', str( scaledCPUTime - self.scaledCPUTime ) )
    self.scaledCPUTime = scaledCPUTime

    return S_OK( 'Job Agent cycle complete' )
Ejemplo n.º 35
0
def main():

  from DIRAC.Core.Base import Script
  Script.initialize() 

  DIRAC.gLogger.notice('Platform is:')
  os.system('dirac-platform')
  from DIRAC.DataManagementSystem.Client.DataManager import DataManager
  from CTADIRAC.Core.Workflow.Modules.EvnDispApp import EvnDispApp
  from CTADIRAC.Core.Utilities.SoftwareInstallation import checkSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwarePackage
  from CTADIRAC.Core.Utilities.SoftwareInstallation import installSoftwareEnviron
  from CTADIRAC.Core.Utilities.SoftwareInstallation import sharedArea
  from CTADIRAC.Core.Utilities.SoftwareInstallation import workingArea
  from DIRAC.Core.Utilities.Subprocess import systemCall
  from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport

  jobID = os.environ['JOBID']
  jobID = int( jobID )
  jobReport = JobReport( jobID )

  version = sys.argv[3]
  DIRAC.gLogger.notice( 'Version:', version )

  EvnDispPack = os.path.join('evndisplay',version,'evndisplay')

  packs = [EvnDispPack]

  for package in packs:
    DIRAC.gLogger.notice( 'Checking:', package )
    if checkSoftwarePackage( package, sharedArea() )['OK']:
      DIRAC.gLogger.notice( 'Package found in Shared Area:', package )
      installSoftwareEnviron( package, sharedArea() )
#      cmd = 'cp -r ' + os.path.join(sharedArea(),'evndisplay',version,'EVNDISP.CTA.runparameter') + ' .'
#      if(os.system(cmd)):
#        DIRAC.exit( -1 )
#      cmd = 'cp -r ' + os.path.join(sharedArea(),'evndisplay',version,'Calibration') + ' .'
#      if(os.system(cmd)):
#        DIRAC.exit( -1 )
      continue
    else:
      result = installSoftwarePackage( package, workingArea() )
      if not result['OK']:
        DIRAC.gLogger.error( 'Check Failed for software package:', package )
        DIRAC.gLogger.error( 'Software package not available' )
        DIRAC.exit( -1 )
      DIRAC.gLogger.notice( 'Package installed in workingArea:', package )
      continue

  ed = EvnDispApp()
  ed.setSoftwarePackage(EvnDispPack)

########## Use of trg mask file #######################
  usetrgfile = sys.argv[7]
  DIRAC.gLogger.notice( 'Usetrgfile:', usetrgfile )

####### Use of multiple inputs per job ###
  simtelFileLFNList = sys.argv[-1].split('ParametricParameters={')[1].split('}')[0].replace(',',' ')
  # first element of the list
  simtelFileLFN = simtelFileLFNList.split(' ')[0]  
  ## convert the string into a list and get the basename
  simtelFileList = []
  for word in simtelFileLFNList.split():
    simtelFileList.append(os.path.basename(word))
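  # For illustration (hypothetical LFNs): an argument containing
  #   'ParametricParameters={/vo/prod/gamma_run1.simtel.gz,/vo/prod/gamma_run2.simtel.gz}'
  # gives simtelFileLFN = '/vo/prod/gamma_run1.simtel.gz' and
  # simtelFileList = ['gamma_run1.simtel.gz', 'gamma_run2.simtel.gz']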

####  Parse the Layout List #################
  layoutList = parseLayoutList(sys.argv[9])
#############################################

####  Loop over the Layout List #################
  for layout in layoutList: 
    args = []
########## download trg mask file #######################
    if usetrgfile == 'True':
      trgmaskFileLFN = simtelFileLFN.replace( 'simtel.gz', 'trgmask.gz' )
      DIRAC.gLogger.notice( 'Trying to download the trgmask File', trgmaskFileLFN )
      result = DataManager().getFile( trgmaskFileLFN )
      if not result['OK']:
        DIRAC.gLogger.error( 'Failed to download trgmask file:', result )
        jobReport.setApplicationStatus( 'Trgmask file download Error' )
        DIRAC.exit( -1 )
      args.extend( ['-t', os.path.basename( trgmaskFileLFN )] )
############################################################
###### execute evndisplay converter ##################
    executable = sys.argv[5]

############ dst file Name ############################
    run_number = simtelFileList[-1].split( 'run' )[1].split( '.simtel.gz' )[0]
    runNum = int( run_number )
    subRunNumber = '%06d' % runNum
    particle = simtelFileList[-1].split( '_' )[0]
    if 'ptsrc' in simtelFileList[-1]:
      particle = particle + '_' + 'ptsrc'
    dstfile = particle + '_run' + subRunNumber + '_' + str( jobID ) + '_' + os.path.basename( layout ) + '_dst.root'
###########################################
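    # For illustration (hypothetical names): with a simtel file like
    # 'gamma_20deg_run123.simtel.gz', jobID 4567 and layout 'hyperarray', the dst
    # file would be named 'gamma_run000123_4567_hyperarray_dst.root'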

    logfileName = executable + '_' + layout + '.log'
    layout = os.path.join( 'EVNDISP.CTA.runparameter/DetectorGeometry', layout )
    DIRAC.gLogger.notice( 'Layout is:', layout )

  # add other arguments for evndisplay converter specified by user ######
    converterparfile = open( 'converter.par', 'r' ).readlines()
    for line in converterparfile:
      for word in line.split():
        args.append( word )
#########################################################
    args.extend( ['-a', layout] )
    args.extend( ['-o', dstfile] )
    args.extend( simtelFileList )
    execute_module( ed, executable, args )
########### check existence of DST file ###############
    if not os.path.isfile( dstfile ):
      DIRAC.gLogger.error( 'DST file Missing:', dstfile )
      jobReport.setApplicationStatus( 'DST file Missing' )
      DIRAC.exit( -1 )

########### quality check on Log #############################################
    cmd = 'mv ' + executable + '.log' + ' ' + logfileName
    if( os.system( cmd ) ):
      DIRAC.exit( -1 )

    fd = open( 'check_log.sh', 'w' )
    fd.write( """#! /bin/sh
MCevts=$(grep writing  %s | grep "MC events" | awk '{print $2}')
if [ $MCevts -gt 0 ]; then
    exit 0
else
    echo "MCevts is zero"
    exit -1
fi
""" % (logfileName))
    fd.close()
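    # The generated script exits with a non-zero code when the converter log does
    # not report a positive number of MC events, failing the quality check below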

    os.system( 'chmod u+x check_log.sh' )
    cmd = './check_log.sh'
    DIRAC.gLogger.notice( 'Executing system call:', cmd )
    if( os.system( cmd ) ):
      jobReport.setApplicationStatus( 'Converter Log Check Failed' )
      DIRAC.exit( -1 )

    ####  Check the mode #################
    mode = sys.argv[11]
    if( mode == 'convert_standalone' ):
      #DIRAC.exit()
      continue

###### execute evndisplay stage1 ###############
    executable = 'evndisp'
    logfileName = executable + '_' + os.path.basename( layout ) + '.log'

    args = ['-sourcefile', dstfile, '-outputdirectory', 'outdir']
  # add other arguments for evndisp specified by user ######
    evndispparfile = open( 'evndisp.par', 'r' ).readlines()
    for line in evndispparfile:
      for word in line.split():
        args.append( word )

    execute_module( ed, executable, args )

    for name in glob.glob( 'outdir/*.root' ):
      evndispOutFile = name.split( '.root' )[0] + '_' + str( jobID ) + '_' + os.path.basename( layout ) + '_evndisp.root'
      cmd = 'mv ' + name + ' ' + os.path.basename( evndispOutFile )
      if( os.system( cmd ) ):
        DIRAC.exit( -1 )

########### quality check on Log #############################################
    cmd = 'mv ' + executable + '.log' + ' ' + logfileName
    if( os.system( cmd ) ):
      DIRAC.exit( -1 )
    fd = open( 'check_log.sh', 'w' )
    fd.write( """#! /bin/sh
if grep -i "error" %s; then
exit 1
fi
if grep "Final checks on result file (seems to be OK):" %s; then
exit 0
else
exit 1
fi
""" % (logfileName,logfileName))
    fd.close()
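    # The evndisp log is accepted only if it contains no 'error' (case-insensitive)
    # and does contain the final 'seems to be OK' check line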

    os.system( 'chmod u+x check_log.sh' )
    cmd = './check_log.sh'
    DIRAC.gLogger.notice( 'Executing system call:', cmd )
    if( os.system( cmd ) ):
      jobReport.setApplicationStatus( 'EvnDisp Log Check Failed' )
      DIRAC.exit( -1 )
##################################################################
########### remove the converted dst file #############################################
    cmd = 'rm ' + dstfile
    if( os.system( cmd ) ):
      DIRAC.exit( -1 )
 
  DIRAC.exit()