Example #1
0
    def execute(self):
        """The JobAgent execution method."""

        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists("/var/lib/dirac_drain"):
            return self._finish("Node is being drained by an operator")

        self.log.verbose("Job Agent execution loop")

        # Check that there is enough slots to match a job
        result = self._checkCEAvailability(self.computingElement)
        if not result["OK"]:
            return self._finish(result["Message"])
        if result["OK"] and result["Value"]:
            return result

        # Check that we are allowed to continue and that time left is sufficient
        if self.jobCount:
            cpuWorkLeft = self._computeCPUWorkLeft()
            result = self._checkCPUWorkLeft(cpuWorkLeft)
            if not result["OK"]:
                return result
            result = self._setCPUWorkLeft(cpuWorkLeft)
            if not result["OK"]:
                return result

        # Get environment details and enhance them
        result = self._getCEDict(self.computingElement)
        if not result["OK"]:
            return result
        ceDictList = result["Value"]

        for ceDict in ceDictList:
            self._setCEDict(ceDict)

        # Try to match a job
        jobRequest = self._matchAJob(ceDictList)

        self.stopAfterFailedMatches = self.am_getOption(
            "StopAfterFailedMatches", self.stopAfterFailedMatches)
        if not jobRequest["OK"]:
            res = self._checkMatchingIssues(jobRequest)
            if not res["OK"]:
                self._finish(res["Message"])
                return res

            # if we don't match a job, independently from the reason,
            # we wait a bit longer before trying again
            time.sleep(
                int(self.am_getOption("PollingTime")) *
                (self.matchFailedCount + 1) * 2)
            return res

        # If we are, we matched a job
        # Reset the Counter
        self.matchFailedCount = 0

        # Check matcher information returned
        matcherParams = ["JDL", "DN", "Group"]
        matcherInfo = jobRequest["Value"]
        jobID = matcherInfo["JobID"]
        jobReport = JobReport(jobID, "JobAgent@%s" % self.siteName)
        result = self._checkMatcherInfo(matcherInfo, matcherParams, jobReport)
        if not result["OK"]:
            return self._finish(result["Message"])

        # Get matcher information
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                "PilotInfoReportedFlag", False)

        jobJDL = matcherInfo["JDL"]
        jobGroup = matcherInfo["Group"]
        ownerDN = matcherInfo["DN"]
        ceDict = matcherInfo["CEDict"]
        matchTime = matcherInfo["matchTime"]

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        # Get JDL paramters
        parameters = self._getJDLParameters(jobJDL)
        if not parameters["OK"]:
            jobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus="Could Not Extract JDL Parameters")
            self.log.warn("Could Not Extract JDL Parameters",
                          parameters["Message"])
            return self._finish("JDL Problem")

        params = parameters["Value"]
        result = self._extractValuesFromJobParams(params, jobReport)
        if not result["OK"]:
            return self._finish(result["Value"])
        submissionParams = result["Value"]
        jobID = submissionParams["jobID"]
        jobType = submissionParams["jobType"]

        self.log.verbose("Job request successful: \n", jobRequest["Value"])
        self.log.info(
            "Received", "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport.setJobParameter(par_name="MatcherServiceTime",
                                      par_value=str(matchTime),
                                      sendFlag=False)

            if "BOINC_JOB_ID" in os.environ:
                # Report BOINC environment
                for thisp in ("BoincUserID", "BoincHostID",
                              "BoincHostPlatform", "BoincHostName"):
                    jobReport.setJobParameter(par_name=thisp,
                                              par_value=gConfig.getValue(
                                                  "/LocalSite/%s" % thisp,
                                                  "Unknown"),
                                              sendFlag=False)

            jobReport.setJobStatus(minorStatus="Job Received by Agent",
                                   sendFlag=False)
            result_setupProxy = self._setupProxy(ownerDN, jobGroup)
            if not result_setupProxy["OK"]:
                result = self._rescheduleFailedJob(
                    jobID, result_setupProxy["Message"])
                return self._finish(result["Message"],
                                    self.stopOnApplicationFailure)
            proxyChain = result_setupProxy.get("Value")

            # Save the job jdl for external monitoring
            self._saveJobJDLRequest(jobID, jobJDL)

            # Check software and install them if required
            software = self._checkInstallSoftware(jobID, params, ceDict,
                                                  jobReport)
            if not software["OK"]:
                self.log.error("Failed to install software for job",
                               "%s" % (jobID))
                errorMsg = software["Message"]
                if not errorMsg:
                    errorMsg = "Failed software installation"
                result = self._rescheduleFailedJob(jobID, errorMsg)
                return self._finish(result["Message"],
                                    self.stopOnApplicationFailure)

            gridCE = gConfig.getValue("/LocalSite/GridCE", "")
            if gridCE:
                jobReport.setJobParameter(par_name="GridCE",
                                          par_value=gridCE,
                                          sendFlag=False)

            queue = gConfig.getValue("/LocalSite/CEQueue", "")
            if queue:
                jobReport.setJobParameter(par_name="CEQueue",
                                          par_value=queue,
                                          sendFlag=False)

            self.log.debug("Before self._submitJob() (%sCE)" % (self.ceName))
            result_submitJob = self._submitJob(
                jobID=jobID,
                jobParams=params,
                resourceParams=ceDict,
                optimizerParams=optimizerParams,
                proxyChain=proxyChain,
                jobReport=jobReport,
                processors=submissionParams["processors"],
                wholeNode=submissionParams["wholeNode"],
                maxNumberOfProcessors=submissionParams[
                    "maxNumberOfProcessors"],
                mpTag=submissionParams["mpTag"],
            )

            # Committing the JobReport before evaluating the result of job submission
            res = jobReport.commit()
            if not res["OK"]:
                resFD = jobReport.generateForwardDISET()
                if not resFD["OK"]:
                    self.log.error("Error generating ForwardDISET operation",
                                   resFD["Message"])
                elif resFD["Value"]:
                    # Here we create the Request.
                    op = resFD["Value"]
                    request = Request()
                    requestName = "jobAgent_%s" % jobID
                    request.RequestName = requestName.replace('"', "")
                    request.JobID = jobID
                    request.SourceComponent = "JobAgent_%s" % jobID
                    request.addOperation(op)
                    # This might fail, but only a message would be printed.
                    self._sendFailoverRequest(request)

            if not result_submitJob["OK"]:
                return self._finish(result_submitJob["Message"])
            elif "PayloadFailed" in result_submitJob:
                # Do not keep running and do not overwrite the Payload error
                message = "Payload execution failed with error code %s" % result_submitJob[
                    "PayloadFailed"]
                if self.stopOnApplicationFailure:
                    return self._finish(message, self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug("After %sCE submitJob()" % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            result = self._rescheduleFailedJob(
                jobID, "Job processing failed with exception", direct=True)
            return self._finish(result["Message"],
                                self.stopOnApplicationFailure)

        return S_OK("Job Agent cycle complete")
Example #2
0
def execute(arguments):
  """ The only real function executed here
  """

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int(jobID)

  if 'WorkingDirectory' in arguments:
    wdir = os.path.expandvars(arguments['WorkingDirectory'])
    if os.path.isdir(wdir):
      os.chdir(wdir)
    else:
      try:
        os.makedirs(wdir)  # this will raise an exception if wdir already exists (which is ~OK)
        if os.path.isdir(wdir):
          os.chdir(wdir)
      except OSError as osError:
        if osError.errno == errno.EEXIST and os.path.isdir(wdir):
          gLogger.exception('JobWrapperTemplate found that the working directory already exists')
          rescheduleResult = rescheduleFailedJob(jobID, 'Working Directory already exists')
        else:
          gLogger.exception('JobWrapperTemplate could not create working directory')
          rescheduleResult = rescheduleFailedJob(jobID, 'Could Not Create Working Directory')
        return 1

  gJobReport = JobReport(jobID, 'JobWrapper')

  try:
    job = JobWrapper(jobID, gJobReport)
    job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('JobWrapper failed the initialization phase', lException=exc)
    rescheduleResult = rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
    try:
      job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper failed sending job accounting', lException=exc)
    return 1

  if 'InputSandbox' in arguments['Job']:
    gJobReport.commit()
    try:
      result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except JobWrapperError:
      gLogger.exception('JobWrapper failed to download input sandbox')
      rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
      return 1
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper raised exception while downloading input sandbox', lException=exc)
      rescheduleResult = rescheduleFailedJob(jobID, 'Input Sandbox Download', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
      return 1
  else:
    gLogger.verbose('Job has no InputSandbox requirement')

  gJobReport.commit()

  if 'InputData' in arguments['Job']:
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn(result['Message'])
          raise JobWrapperError(result['Message'])
      except JobWrapperError:
        gLogger.exception('JobWrapper failed to resolve input data')
        rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
        return 1
      except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception('JobWrapper raised exception while resolving input data', lException=exc)
        rescheduleResult = rescheduleFailedJob(jobID, 'Input Data Resolution', gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Input Data Resolution')
        return 1
    else:
      gLogger.verbose('Job has a null InputData requirement:')
      gLogger.verbose(arguments)
  else:
    gLogger.verbose('Job has no InputData requirement')

  gJobReport.commit()

  try:
    result = job.execute(arguments)
    if not result['OK']:
      gLogger.error('Failed to execute job', result['Message'])
      raise JobWrapperError((result['Message'], result['Errno']))
  except JobWrapperError as exc:
    if exc.value[1] == 0 or str(exc.value[0]) == '0':
      gLogger.verbose('JobWrapper exited with status=0 after execution')
    if exc.value[1] == DErrno.EWMSRESC:
      gLogger.warn("Asked to reschedule job")
      rescheduleResult = rescheduleFailedJob(jobID, 'JobWrapper execution', gJobReport)
      job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
      return 1
    gLogger.exception('Job failed in execution phase')
    gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    gJobReport.setJobStatus(
        'Failed', 'Exception During Execution', sendFlag=False)
    job.sendFailoverRequest('Failed', 'Exception During Execution')
    return 1
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('Job raised exception during execution phase', lException=exc)
    gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
    gJobReport.setJobStatus('Failed', 'Exception During Execution', sendFlag=False)
    job.sendFailoverRequest('Failed', 'Exception During Execution')
    return 1

  if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
    try:
      result = job.processJobOutputs()
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except JobWrapperError as exc:
      gLogger.exception('JobWrapper failed to process output files')
      gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
      gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
      job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
      return 2
    except Exception as exc:  # pylint: disable=broad-except
      gLogger.exception('JobWrapper raised exception while processing output files', lException=exc)
      gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
      gJobReport.setJobStatus('Failed', 'Uploading Job Outputs', sendFlag=False)
      job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
      return 2
  else:
    gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize()
  except Exception as exc:  # pylint: disable=broad-except
    gLogger.exception('JobWrapper raised exception during the finalization phase', lException=exc)
    return 2
Example #3
0
def execute(arguments):
    """ The only real function executed here
  """

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if 'WorkingDirectory' in arguments:
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        'JobWrapperTemplate found that the working directory already exists'
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Working Directory already exists')
                else:
                    gLogger.exception(
                        'JobWrapperTemplate could not create working directory'
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  #pylint: disable=broad-except
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult,
                                  'Job Wrapper Initialization')
        except Exception as exc:  #pylint: disable=broad-except
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=exc)
        return 1

    if 'InputSandbox' in arguments['Job']:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
        except Exception as exc:  #pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while downloading input sandbox',
                lException=exc)
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if 'InputData' in arguments['Job']:
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except JobWrapperError:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
            except Exception as exc:  #pylint: disable=broad-except
                gLogger.exception(
                    'JobWrapper raised exception while resolving input data',
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError((result['Message'], result['Errno']))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'JobWrapper execution',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'JobWrapper execution')
            return 1
        gLogger.exception('Job failed in execution phase')
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed',
                                'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1
    except Exception as exc:  #pylint: disable=broad-except
        gLogger.exception('Job raised exception during execution phase',
                          lException=exc)
        gJobReport.setJobParameter('Error Message', str(exc), sendFlag=False)
        gJobReport.setJobStatus('Failed',
                                'Exception During Execution',
                                sendFlag=False)
        job.sendFailoverRequest('Failed', 'Exception During Execution')
        return 1

    if 'OutputSandbox' in arguments['Job'] or 'OutputData' in arguments['Job']:
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except JobWrapperError as exc:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message',
                                       str(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                'JobWrapper raised exception while processing output files',
                lException=exc)
            gJobReport.setJobParameter('Error Message',
                                       str(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  #pylint: disable=broad-except
        gLogger.exception(
            'JobWrapper raised exception during the finalization phase',
            lException=exc)
        return 2
Example #4
0
def execute ( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )
  # Fix in the environment to get a reasonable performance from dCache,
  # until we move to a new version of root
#  os.environ['DCACHE_RAHEAD'] = str(1)
#  os.environ['DCACHE_RA_BUFFER'] = str(50*1024)

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  #root = arguments['CE']['Root']
  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the initialization phase' )
    rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    job.sendWMSAccounting( 'Failed', 'Job Wrapper Initialization' )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleFailedJob( jobID, 'Input Sandbox Download' )
      job.sendWMSAccounting( 'Failed', 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception, x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleFailedJob( jobID, 'Input Data Resolution' )
        job.sendWMSAccounting( 'Failed', 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
Example #5
0
def execute ( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the initialization phase' )
    rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
      job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception, x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
        job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
Example #6
0
    def execute(self):
        """The JobAgent execution method."""
        self.log.verbose("Job Agent execution loop")

        queueDictItems = list(self.queueDict.items())
        random.shuffle(queueDictItems)

        # Check that there is enough slots locally
        result = self._checkCEAvailability(self.computingElement)
        if not result["OK"] or result["Value"]:
            return result

        for queueName, queueDictionary in queueDictItems:

            # Make sure there is no problem with the queue before trying to submit
            if not self._allowedToSubmit(queueName):
                continue

            # Get a working proxy
            ce = queueDictionary["CE"]
            cpuTime = 86400 * 3
            self.log.verbose(
                "Getting pilot proxy",
                "for %s/%s %d long" % (self.pilotDN, self.pilotGroup, cpuTime))
            result = gProxyManager.getPilotProxyFromDIRACGroup(
                self.pilotDN, self.pilotGroup, cpuTime)
            if not result["OK"]:
                return result
            proxy = result["Value"]
            result = proxy.getRemainingSecs()  # pylint: disable=no-member
            if not result["OK"]:
                return result
            lifetime_secs = result["Value"]
            ce.setProxy(proxy, lifetime_secs)

            # Check that there is enough slots in the remote CE to match a job
            result = self._checkCEAvailability(ce)
            if not result["OK"] or result["Value"]:
                self.failedQueues[queueName] += 1
                continue

            # Get environment details and enhance them
            result = self._getCEDict(ce)
            if not result["OK"]:
                self.failedQueues[queueName] += 1
                continue
            ceDictList = result["Value"]

            for ceDict in ceDictList:
                # Information about number of processors might not be returned in CE.getCEStatus()
                ceDict["NumberOfProcessors"] = ce.ceParameters.get(
                    "NumberOfProcessors")
                self._setCEDict(ceDict)

            # Update the configuration with the names of the Site, CE and queue to target
            # This is used in the next stages
            self._updateConfiguration("Site", queueDictionary["Site"])
            self._updateConfiguration("GridCE", queueDictionary["CEName"])
            self._updateConfiguration("CEQueue", queueDictionary["QueueName"])
            self._updateConfiguration("RemoteExecution", True)

            # Try to match a job
            jobRequest = self._matchAJob(ceDictList)
            while jobRequest["OK"]:

                # Check matcher information returned
                matcherParams = ["JDL", "DN", "Group"]
                matcherInfo = jobRequest["Value"]
                jobID = matcherInfo["JobID"]
                jobReport = JobReport(jobID, "PushJobAgent@%s" % self.siteName)
                result = self._checkMatcherInfo(matcherInfo, matcherParams,
                                                jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break

                jobJDL = matcherInfo["JDL"]
                jobGroup = matcherInfo["Group"]
                ownerDN = matcherInfo["DN"]
                ceDict = matcherInfo["CEDict"]
                matchTime = matcherInfo["matchTime"]

                optimizerParams = {}
                for key in matcherInfo:
                    if key not in matcherParams:
                        optimizerParams[key] = matcherInfo[key]

                # Get JDL paramters
                parameters = self._getJDLParameters(jobJDL)
                if not parameters["OK"]:
                    jobReport.setJobStatus(
                        status=JobStatus.FAILED,
                        minorStatus="Could Not Extract JDL Parameters")
                    self.log.warn("Could Not Extract JDL Parameters",
                                  parameters["Message"])
                    self.failedQueues[queueName] += 1
                    break

                params = parameters["Value"]
                result = self._extractValuesFromJobParams(params, jobReport)
                if not result["OK"]:
                    self.failedQueues[queueName] += 1
                    break
                submissionParams = result["Value"]
                jobID = submissionParams["jobID"]
                jobType = submissionParams["jobType"]

                self.log.verbose("Job request successful: \n",
                                 jobRequest["Value"])
                self.log.info(
                    "Received",
                    "JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s" %
                    (jobID, jobType, ownerDN, jobGroup))
                try:
                    jobReport.setJobParameter(par_name="MatcherServiceTime",
                                              par_value=str(matchTime),
                                              sendFlag=False)
                    jobReport.setJobStatus(status=JobStatus.MATCHED,
                                           minorStatus="Job Received by Agent",
                                           sendFlag=False)

                    # Setup proxy
                    result_setupProxy = self._setupProxy(ownerDN, jobGroup)
                    if not result_setupProxy["OK"]:
                        result = self._rescheduleFailedJob(
                            jobID, result_setupProxy["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    proxyChain = result_setupProxy.get("Value")

                    # Check software and install them if required
                    software = self._checkInstallSoftware(
                        jobID, params, ceDict, jobReport)
                    if not software["OK"]:
                        self.log.error("Failed to install software for job",
                                       "%s" % (jobID))
                        errorMsg = software["Message"]
                        if not errorMsg:
                            errorMsg = "Failed software installation"
                        result = self._rescheduleFailedJob(jobID, errorMsg)
                        self.failedQueues[queueName] += 1
                        break

                    # Submit the job to the CE
                    self.log.debug("Before self._submitJob() (%sCE)" %
                                   (self.ceName))
                    result_submitJob = self._submitJob(
                        jobID=jobID,
                        jobParams=params,
                        resourceParams=ceDict,
                        optimizerParams=optimizerParams,
                        proxyChain=proxyChain,
                        jobReport=jobReport,
                        processors=submissionParams["processors"],
                        wholeNode=submissionParams["wholeNode"],
                        maxNumberOfProcessors=submissionParams[
                            "maxNumberOfProcessors"],
                        mpTag=submissionParams["mpTag"],
                    )

                    # Committing the JobReport before evaluating the result of job submission
                    res = jobReport.commit()
                    if not res["OK"]:
                        resFD = jobReport.generateForwardDISET()
                        if not resFD["OK"]:
                            self.log.error(
                                "Error generating ForwardDISET operation",
                                resFD["Message"])
                        elif resFD["Value"]:
                            # Here we create the Request.
                            op = resFD["Value"]
                            request = Request()
                            requestName = "jobAgent_%s" % jobID
                            request.RequestName = requestName.replace('"', "")
                            request.JobID = jobID
                            request.SourceComponent = "JobAgent_%s" % jobID
                            request.addOperation(op)
                            # This might fail, but only a message would be printed.
                            self._sendFailoverRequest(request)

                    if not result_submitJob["OK"]:
                        self.log.error("Error during submission",
                                       result_submitJob["Message"])
                        self.failedQueues[queueName] += 1
                        break
                    elif "PayloadFailed" in result_submitJob:
                        # Do not keep running and do not overwrite the Payload error
                        message = "Payload execution failed with error code %s" % result_submitJob[
                            "PayloadFailed"]
                        self.log.info(message)

                    self.log.debug("After %sCE submitJob()" % (self.ceName))

                    # Check that there is enough slots locally
                    result = self._checkCEAvailability(self.computingElement)
                    if not result["OK"] or result["Value"]:
                        return result

                    # Check that there is enough slots in the remote CE to match a new job
                    result = self._checkCEAvailability(ce)
                    if not result["OK"] or result["Value"]:
                        self.failedQueues[queueName] += 1
                        break

                    # Try to match a new job
                    jobRequest = self._matchAJob(ceDictList)
                except Exception as subExcept:  # pylint: disable=broad-except
                    self.log.exception("Exception in submission",
                                       "",
                                       lException=subExcept,
                                       lExcInfo=True)
                    result = self._rescheduleFailedJob(
                        jobID, "Job processing failed with exception")
                    self.failedQueues[queueName] += 1
                    break

            if not jobRequest["OK"]:
                self._checkMatchingIssues(jobRequest)
                self.failedQueues[queueName] += 1
                continue

        return S_OK("Push Job Agent cycle complete")
Example #7
0
def execute(arguments):

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)
    # Fix in the environment to get a reasonable performance from dCache,
    # until we move to a new version of root
    #  os.environ['DCACHE_RAHEAD'] = str(1)
    #  os.environ['DCACHE_RA_BUFFER'] = str(50*1024)

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleFailedJob(jobID,
                                    'Could Not Create Working Directory')
                return 1

    #root = arguments['CE']['Root']
    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleFailedJob(jobID, 'Job Wrapper Initialization', gJobReport)
        job.sendWMSAccounting('Failed', 'Job Wrapper Initialization')
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleFailedJob(jobID, 'Input Sandbox Download')
            job.sendWMSAccounting('Failed', 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception, x:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleFailedJob(jobID, 'Input Data Resolution')
                job.sendWMSAccounting('Failed', 'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
Example #8
0
    def execute(self):
        """The JobAgent execution method.
    """

        # Temporary mechanism to pass a shutdown message to the agent
        if os.path.exists('/var/lib/dirac_drain'):
            return self.__finish('Node is being drained by an operator')

        # Check if we can match jobs at all
        self.log.verbose('Job Agent execution loop')
        result = self.computingElement.available()
        if not result['OK']:
            self.log.info('Resource is not available', result['Message'])
            return self.__finish('CE Not Available')

        ceInfoDict = result['CEInfoDict']
        runningJobs = ceInfoDict.get("RunningJobs")
        availableSlots = result['Value']

        if not availableSlots:
            if runningJobs:
                self.log.info('No available slots',
                              ': %d running jobs' % runningJobs)
                return S_OK('Job Agent cycle complete with %d running jobs' %
                            runningJobs)
            self.log.info(
                'CE is not available (and there are no running jobs)')
            return self.__finish('CE Not Available')

        if self.jobCount:
            # Only call timeLeft utility after a job has been picked up
            self.log.info('Attempting to check CPU time left for filling mode')
            if self.fillingMode:
                self.timeLeft = self.computeCPUWorkLeft()
                self.log.info('normalized CPU units remaining in slot',
                              self.timeLeft)
                if self.timeLeft <= self.minimumTimeLeft:
                    return self.__finish('No more time left')
                # Need to update the Configuration so that the new value is published in the next matching request
                result = self.computingElement.setCPUTimeLeft(
                    cpuTimeLeft=self.timeLeft)
                if not result['OK']:
                    return self.__finish(result['Message'])

                # Update local configuration to be used by submitted job wrappers
                localCfg = CFG()
                if self.extraOptions:
                    localConfigFile = os.path.join('.', self.extraOptions)
                else:
                    localConfigFile = os.path.join(rootPath, "etc",
                                                   "dirac.cfg")
                localCfg.loadFromFile(localConfigFile)
                if not localCfg.isSection('/LocalSite'):
                    localCfg.createNewSection('/LocalSite')
                localCfg.setOption('/LocalSite/CPUTimeLeft', self.timeLeft)
                localCfg.writeToFile(localConfigFile)

            else:
                return self.__finish('Filling Mode is Disabled')

        # if we are here we assume that a job can be matched
        result = self.computingElement.getDescription()
        if not result['OK']:
            return result

        # We can have several prioritized job retrieval strategies
        if isinstance(result['Value'], dict):
            ceDictList = [result['Value']]
        elif isinstance(result['Value'], list):
            # This is the case for Pool ComputingElement, and parameter 'MultiProcessorStrategy'
            ceDictList = result['Value']

        for ceDict in ceDictList:

            # Add pilot information
            gridCE = gConfig.getValue('LocalSite/GridCE', 'Unknown')
            if gridCE != 'Unknown':
                ceDict['GridCE'] = gridCE
            if 'PilotReference' not in ceDict:
                ceDict['PilotReference'] = str(self.pilotReference)
            ceDict['PilotBenchmark'] = self.cpuFactor
            ceDict['PilotInfoReportedFlag'] = self.pilotInfoReportedFlag

            # Add possible job requirements
            result = gConfig.getOptionsDict('/AgentJobRequirements')
            if result['OK']:
                requirementsDict = result['Value']
                ceDict.update(requirementsDict)
                self.log.info('Requirements:', requirementsDict)

            self.log.verbose('CE dict', ceDict)

            # here finally calling the matcher
            start = time.time()
            jobRequest = MatcherClient().requestJob(ceDict)
            matchTime = time.time() - start
            self.log.info('MatcherTime', '= %.2f (s)' % (matchTime))
            if jobRequest['OK']:
                break

        self.stopAfterFailedMatches = self.am_getOption(
            'StopAfterFailedMatches', self.stopAfterFailedMatches)

        if not jobRequest['OK']:

            # if we don't match a job, independently from the reason,
            # we wait a bit longer before trying again
            self.am_setOption("PollingTime",
                              int(self.am_getOption("PollingTime") * 1.5))

            if re.search('No match found', jobRequest['Message']):
                self.log.notice('Job request OK, but no match found',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find("seconds timeout") != -1:
                self.log.error('Timeout while requesting job',
                               jobRequest['Message'])
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])
            elif jobRequest['Message'].find(
                    "Pilot version does not match") != -1:
                errorMsg = 'Pilot version does not match the production version'
                self.log.error(errorMsg,
                               jobRequest['Message'].replace(errorMsg, ''))
                return S_ERROR(jobRequest['Message'])
            else:
                self.log.notice('Failed to get jobs',
                                ': %s' % (jobRequest['Message']))
                self.matchFailedCount += 1
                if self.matchFailedCount > self.stopAfterFailedMatches:
                    return self.__finish(
                        'Nothing to do for more than %d cycles' %
                        self.stopAfterFailedMatches)
                return S_OK(jobRequest['Message'])

        # Reset the Counter
        self.matchFailedCount = 0

        # If we are here it is because we matched a job
        matcherInfo = jobRequest['Value']
        if not self.pilotInfoReportedFlag:
            # Check the flag after the first access to the Matcher
            self.pilotInfoReportedFlag = matcherInfo.get(
                'PilotInfoReportedFlag', False)
        jobID = matcherInfo['JobID']
        jobReport = JobReport(jobID, 'JobAgent@%s' % self.siteName)
        matcherParams = ['JDL', 'DN', 'Group']
        for param in matcherParams:
            if param not in matcherInfo:
                jobReport.setJobStatus(status='Failed',
                                       minor='Matcher did not return %s' %
                                       (param))
                return self.__finish('Matcher Failed')
            elif not matcherInfo[param]:
                jobReport.setJobStatus(status='Failed',
                                       minor='Matcher returned null %s' %
                                       (param))
                return self.__finish('Matcher Failed')
            else:
                self.log.verbose('Matcher returned',
                                 '%s = %s ' % (param, matcherInfo[param]))

        jobJDL = matcherInfo['JDL']
        jobGroup = matcherInfo['Group']
        ownerDN = matcherInfo['DN']

        optimizerParams = {}
        for key in matcherInfo:
            if key not in matcherParams:
                optimizerParams[key] = matcherInfo[key]

        parameters = self._getJDLParameters(jobJDL)
        if not parameters['OK']:
            jobReport.setJobStatus(status='Failed',
                                   minor='Could Not Extract JDL Parameters')
            self.log.warn('Could Not Extract JDL Parameters',
                          parameters['Message'])
            return self.__finish('JDL Problem')

        params = parameters['Value']
        if 'JobID' not in params:
            msg = 'Job has not JobID defined in JDL parameters'
            jobReport.setJobStatus(status='Failed', minor=msg)
            self.log.warn(msg)
            return self.__finish('JDL Problem')
        else:
            jobID = params['JobID']

        if 'JobType' not in params:
            self.log.warn('Job has no JobType defined in JDL parameters')
            jobType = 'Unknown'
        else:
            jobType = params['JobType']

        if 'CPUTime' not in params:
            self.log.warn(
                'Job has no CPU requirement defined in JDL parameters')

        # Job requirements for determining the number of processors
        # the minimum number of processors requested
        processors = int(
            params.get('NumberOfProcessors',
                       int(params.get('MinNumberOfProcessors', 1))))
        # the maximum number of processors allowed to the payload
        maxNumberOfProcessors = int(params.get('MaxNumberOfProcessors', 0))
        # need or not the whole node for the job
        wholeNode = 'WholeNode' in params
        mpTag = 'MultiProcessor' in params.get('Tags', [])

        if self.extraOptions and 'dirac-jobexec' in params.get(
                'Executable', '').strip():
            params['Arguments'] = (params.get('Arguments', '') + ' ' +
                                   self.extraOptions).strip()
            params['ExtraOptions'] = self.extraOptions

        self.log.verbose('Job request successful: \n', jobRequest['Value'])
        self.log.info(
            'Received', 'JobID=%s, JobType=%s, OwnerDN=%s, JobGroup=%s' %
            (jobID, jobType, ownerDN, jobGroup))
        self.jobCount += 1
        try:
            jobReport.setJobParameter(par_name='MatcherServiceTime',
                                      par_value=str(matchTime),
                                      sendFlag=False)

            if 'BOINC_JOB_ID' in os.environ:
                # Report BOINC environment
                for thisp in ('BoincUserID', 'BoincHostID',
                              'BoincHostPlatform', 'BoincHostName'):
                    jobReport.setJobParameter(par_name=thisp,
                                              par_value=gConfig.getValue(
                                                  '/LocalSite/%s' % thisp,
                                                  'Unknown'),
                                              sendFlag=False)

            jobReport.setJobStatus(status='Matched',
                                   minor='Job Received by Agent',
                                   sendFlag=False)
            result_setupProxy = self._setupProxy(ownerDN, jobGroup)
            if not result_setupProxy['OK']:
                return self._rescheduleFailedJob(jobID,
                                                 result_setupProxy['Message'],
                                                 self.stopOnApplicationFailure)
            proxyChain = result_setupProxy.get('Value')

            # Save the job jdl for external monitoring
            self.__saveJobJDLRequest(jobID, jobJDL)

            software = self._checkInstallSoftware(jobID, params, ceDict,
                                                  jobReport)
            if not software['OK']:
                self.log.error('Failed to install software for job',
                               '%s' % (jobID))
                errorMsg = software['Message']
                if not errorMsg:
                    errorMsg = 'Failed software installation'
                return self._rescheduleFailedJob(jobID, errorMsg,
                                                 self.stopOnApplicationFailure)

            self.log.debug('Before self._submitJob() (%sCE)' % (self.ceName))
            result_submitJob = self._submitJob(
                jobID=jobID,
                jobParams=params,
                resourceParams=ceDict,
                optimizerParams=optimizerParams,
                proxyChain=proxyChain,
                jobReport=jobReport,
                processors=processors,
                wholeNode=wholeNode,
                maxNumberOfProcessors=maxNumberOfProcessors,
                mpTag=mpTag)

            # Committing the JobReport before evaluating the result of job submission
            res = jobReport.commit()
            if not res['OK']:
                resFD = jobReport.generateForwardDISET()
                if not resFD['OK']:
                    self.log.error("Error generating ForwardDISET operation",
                                   resFD['Message'])
                else:
                    # Here we create the Request.
                    op = resFD['Value']
                    request = Request()
                    requestName = 'jobAgent_%s' % jobID
                    request.RequestName = requestName.replace('"', '')
                    request.JobID = jobID
                    request.SourceComponent = "JobAgent_%s" % jobID
                    request.addOperation(op)
                    # This might fail, but only a message would be printed.
                    self._sendFailoverRequest(request)

            if not result_submitJob['OK']:
                return self.__finish(result_submitJob['Message'])
            elif 'PayloadFailed' in result_submitJob:
                # Do not keep running and do not overwrite the Payload error
                message = 'Payload execution failed with error code %s' % result_submitJob[
                    'PayloadFailed']
                if self.stopOnApplicationFailure:
                    return self.__finish(message,
                                         self.stopOnApplicationFailure)
                else:
                    self.log.info(message)

            self.log.debug('After %sCE submitJob()' % (self.ceName))
        except Exception as subExcept:  # pylint: disable=broad-except
            self.log.exception("Exception in submission",
                               "",
                               lException=subExcept,
                               lExcInfo=True)
            return self._rescheduleFailedJob(
                jobID, 'Job processing failed with exception',
                self.stopOnApplicationFailure)

        return S_OK('Job Agent cycle complete')
Example #9
0
def execute(arguments):
    """The only real function executed here"""

    global gJobReport

    jobID = arguments["Job"].get("JobID", 0)
    os.environ["JOBID"] = str(jobID)
    jobID = int(jobID)

    if "WorkingDirectory" in arguments:
        wdir = os.path.expandvars(arguments["WorkingDirectory"])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(
                    wdir
                )  # this will raise an exception if wdir already exists (which is ~OK)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except OSError as osError:
                if osError.errno == errno.EEXIST and os.path.isdir(wdir):
                    gLogger.exception(
                        "JobWrapperTemplate found that the working directory already exists"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Working Directory already exists")
                else:
                    gLogger.exception(
                        "JobWrapperTemplate could not create working directory"
                    )
                    rescheduleResult = rescheduleFailedJob(
                        jobID, "Could Not Create Working Directory")
                return 1

    gJobReport = JobReport(jobID, "JobWrapper")

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)  # initialize doesn't return S_OK/S_ERROR
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("JobWrapper failed the initialization phase",
                          lException=exc)
        rescheduleResult = rescheduleFailedJob(
            jobID=jobID,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION,
            jobReport=gJobReport)
        job.sendJobAccounting(
            status=rescheduleResult,
            minorStatus=JobMinorStatus.JOB_WRAPPER_INITIALIZATION)
        return 1

    if "InputSandbox" in arguments["Job"]:
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments["Job"]["InputSandbox"])
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError:
            gLogger.exception("JobWrapper failed to download input sandbox")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while downloading input sandbox",
                lException=exc)
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.DOWNLOADING_INPUT_SANDBOX)
            return 1
    else:
        gLogger.verbose("Job has no InputSandbox requirement")

    gJobReport.commit()

    if "InputData" in arguments["Job"]:
        if arguments["Job"]["InputData"]:
            try:
                result = job.resolveInputData()
                if not result["OK"]:
                    gLogger.warn(result["Message"])
                    raise JobWrapperError(result["Message"])
            except JobWrapperError:
                gLogger.exception("JobWrapper failed to resolve input data")
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
            except Exception as exc:  # pylint: disable=broad-except
                gLogger.exception(
                    "JobWrapper raised exception while resolving input data",
                    lException=exc)
                rescheduleResult = rescheduleFailedJob(
                    jobID=jobID,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION,
                    jobReport=gJobReport)
                job.sendJobAccounting(
                    status=rescheduleResult,
                    minorStatus=JobMinorStatus.INPUT_DATA_RESOLUTION)
                return 1
        else:
            gLogger.verbose("Job has a null InputData requirement:")
            gLogger.verbose(arguments)
    else:
        gLogger.verbose("Job has no InputData requirement")

    gJobReport.commit()

    try:
        result = job.execute()
        if not result["OK"]:
            gLogger.error("Failed to execute job", result["Message"])
            raise JobWrapperError((result["Message"], result["Errno"]))
    except JobWrapperError as exc:
        if exc.value[1] == 0 or str(exc.value[0]) == "0":
            gLogger.verbose("JobWrapper exited with status=0 after execution")
        if exc.value[1] == DErrno.EWMSRESC:
            gLogger.warn("Asked to reschedule job")
            rescheduleResult = rescheduleFailedJob(
                jobID=jobID,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION,
                jobReport=gJobReport)
            job.sendJobAccounting(
                status=rescheduleResult,
                minorStatus=JobMinorStatus.JOB_WRAPPER_EXECUTION)
            return 1
        gLogger.exception("Job failed in execution phase")
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception("Job raised exception during execution phase",
                          lException=exc)
        gJobReport.setJobParameter("Error Message", repr(exc), sendFlag=False)
        gJobReport.setJobStatus(
            status=JobStatus.FAILED,
            minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC,
            sendFlag=False)
        job.sendFailoverRequest()
        job.sendJobAccounting(status=JobStatus.FAILED,
                              minorStatus=JobMinorStatus.EXCEPTION_DURING_EXEC)
        return 1

    if "OutputSandbox" in arguments["Job"] or "OutputData" in arguments["Job"]:
        try:
            result = job.processJobOutputs()
            if not result["OK"]:
                gLogger.warn(result["Message"])
                raise JobWrapperError(result["Message"])
        except JobWrapperError as exc:
            gLogger.exception("JobWrapper failed to process output files")
            gJobReport.setJobParameter("Error Message",
                                       repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)

            return 2
        except Exception as exc:  # pylint: disable=broad-except
            gLogger.exception(
                "JobWrapper raised exception while processing output files",
                lException=exc)
            gJobReport.setJobParameter("Error Message",
                                       repr(exc),
                                       sendFlag=False)
            gJobReport.setJobStatus(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS,
                sendFlag=False)
            job.sendFailoverRequest()
            job.sendJobAccounting(
                status=JobStatus.FAILED,
                minorStatus=JobMinorStatus.UPLOADING_JOB_OUTPUTS)
            return 2
    else:
        gLogger.verbose("Job has no OutputData or OutputSandbox requirement")

    try:
        # Failed jobs will return !=0 / successful jobs will return 0
        return job.finalize()
    except Exception as exc:  # pylint: disable=broad-except
        gLogger.exception(
            "JobWrapper raised exception during the finalization phase",
            lException=exc)
        return 2
Example #10
0
def execute(arguments):

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception as e:
        gLogger.exception('JobWrapper failed the initialization phase',
                          lException=e)
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        try:
            job.sendJobAccounting(rescheduleResult,
                                  'Job Wrapper Initialization')
        except Exception as e:
            gLogger.exception('JobWrapper failed sending job accounting',
                              lException=e)
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception as x:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)
    else:
        gLogger.verbose('Job has no InputData requirement')

    gJobReport.commit()

    try:
        result = job.execute(arguments)
        if not result['OK']:
            gLogger.error('Failed to execute job', result['Message'])
            raise JobWrapperError(result['Message'])
    except Exception as x:
        if str(x) == '0':
            gLogger.verbose('JobWrapper exited with status=0 after execution')
        else:
            gLogger.exception('Job failed in execution phase')
            gJobReport.setJobParameter('Error Message', str(x), sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Exception During Execution',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Exception During Execution')
            return 1

    if arguments['Job'].has_key('OutputSandbox') or arguments['Job'].has_key(
            'OutputData'):
        try:
            result = job.processJobOutputs(arguments)
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception as x:
            gLogger.exception('JobWrapper failed to process output files')
            gJobReport.setJobParameter('Error Message', str(x), sendFlag=False)
            gJobReport.setJobStatus('Failed',
                                    'Uploading Job Outputs',
                                    sendFlag=False)
            job.sendFailoverRequest('Failed', 'Uploading Job Outputs')
            return 2
    else:
        gLogger.verbose('Job has no OutputData or OutputSandbox requirement')

    try:
        # Failed jobs will return 1 / successful jobs will return 0
        return job.finalize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the finalization phase')
        return 2
Example #11
0
        return 1

  #root = arguments['CE']['Root']
  jobReport = JobReport(jobID,'JobWrapper')

  try:
    job = JobWrapper( jobID, jobReport )
    job.initialize(arguments)
  except Exception, x:
    gLogger.exception('JobWrapper failed the initialization phase')
    rescheduleFailedJob(jobID,'Job Wrapper Initialization')
    job.sendWMSAccounting('Failed','Job Wrapper Initialization')
    return 1

  if arguments['Job'].has_key('InputSandbox'):
    jobReport.commit()
    try:
      result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
      if not result['OK']:
        gLogger.warn(result['Message'])
        raise JobWrapperError(result['Message'])
    except Exception, x:
      gLogger.exception('JobWrapper failed to download input sandbox')
      rescheduleFailedJob(jobID,'Input Sandbox Download')
      job.sendWMSAccounting('Failed','Input Sandbox Download')
      return 1
  else:
    gLogger.verbose('Job has no InputSandbox requirement')

  jobReport.commit()
def execute( arguments ):

  global gJobReport

  jobID = arguments['Job']['JobID']
  os.environ['JOBID'] = jobID
  jobID = int( jobID )

  if arguments.has_key( 'WorkingDirectory' ):
    wdir = os.path.expandvars( arguments['WorkingDirectory'] )
    if os.path.isdir( wdir ):
      os.chdir( wdir )
    else:
      try:
        os.makedirs( wdir )
        if os.path.isdir( wdir ):
          os.chdir( wdir )
      except Exception:
        gLogger.exception( 'JobWrapperTemplate could not create working directory' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Could Not Create Working Directory' )
        return 1

  gJobReport = JobReport( jobID, 'JobWrapper' )

  try:
    job = JobWrapper( jobID, gJobReport )
    job.initialize( arguments )
  except Exception as e:
    gLogger.exception( 'JobWrapper failed the initialization phase', lException = e )
    rescheduleResult = rescheduleFailedJob( jobID, 'Job Wrapper Initialization', gJobReport )
    try:
      job.sendJobAccounting( rescheduleResult, 'Job Wrapper Initialization' )
    except Exception as e:
      gLogger.exception( 'JobWrapper failed sending job accounting', lException = e )
    return 1

  if arguments['Job'].has_key( 'InputSandbox' ):
    gJobReport.commit()
    try:
      result = job.transferInputSandbox( arguments['Job']['InputSandbox'] )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception:
      gLogger.exception( 'JobWrapper failed to download input sandbox' )
      rescheduleResult = rescheduleFailedJob( jobID, 'Input Sandbox Download', gJobReport )
      job.sendJobAccounting( rescheduleResult, 'Input Sandbox Download' )
      return 1
  else:
    gLogger.verbose( 'Job has no InputSandbox requirement' )

  gJobReport.commit()

  if arguments['Job'].has_key( 'InputData' ):
    if arguments['Job']['InputData']:
      try:
        result = job.resolveInputData()
        if not result['OK']:
          gLogger.warn( result['Message'] )
          raise JobWrapperError( result['Message'] )
      except Exception as x:
        gLogger.exception( 'JobWrapper failed to resolve input data' )
        rescheduleResult = rescheduleFailedJob( jobID, 'Input Data Resolution', gJobReport )
        job.sendJobAccounting( rescheduleResult, 'Input Data Resolution' )
        return 1
    else:
      gLogger.verbose( 'Job has a null InputData requirement:' )
      gLogger.verbose( arguments )
  else:
    gLogger.verbose( 'Job has no InputData requirement' )

  gJobReport.commit()

  try:
    result = job.execute( arguments )
    if not result['OK']:
      gLogger.error( 'Failed to execute job', result['Message'] )
      raise JobWrapperError( result['Message'] )
  except Exception as x:
    if str( x ) == '0':
      gLogger.verbose( 'JobWrapper exited with status=0 after execution' )
    else:
      gLogger.exception( 'Job failed in execution phase' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Exception During Execution', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Exception During Execution' )
      return 1

  if arguments['Job'].has_key( 'OutputSandbox' ) or arguments['Job'].has_key( 'OutputData' ):
    try:
      result = job.processJobOutputs( arguments )
      if not result['OK']:
        gLogger.warn( result['Message'] )
        raise JobWrapperError( result['Message'] )
    except Exception as x:
      gLogger.exception( 'JobWrapper failed to process output files' )
      gJobReport.setJobParameter( 'Error Message', str( x ), sendFlag = False )
      gJobReport.setJobStatus( 'Failed', 'Uploading Job Outputs', sendFlag = False )
      job.sendFailoverRequest( 'Failed', 'Uploading Job Outputs' )
      return 2
  else:
    gLogger.verbose( 'Job has no OutputData or OutputSandbox requirement' )

  try:
    # Failed jobs will return 1 / successful jobs will return 0
    return job.finalize( arguments )
  except Exception:
    gLogger.exception( 'JobWrapper failed the finalization phase' )
    return 2
Example #13
0
                return 1

    #root = arguments['CE']['Root']
    jobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, jobReport)
        job.initialize(arguments)
    except Exception, x:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleFailedJob(jobID, 'Job Wrapper Initialization')
        job.sendWMSAccounting('Failed', 'Job Wrapper Initialization')
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        jobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception, x:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleFailedJob(jobID, 'Input Sandbox Download')
            job.sendWMSAccounting('Failed', 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    jobReport.commit()
Example #14
0
def execute(arguments):

    global gJobReport

    jobID = arguments['Job']['JobID']
    os.environ['JOBID'] = jobID
    jobID = int(jobID)

    if arguments.has_key('WorkingDirectory'):
        wdir = os.path.expandvars(arguments['WorkingDirectory'])
        if os.path.isdir(wdir):
            os.chdir(wdir)
        else:
            try:
                os.makedirs(wdir)
                if os.path.isdir(wdir):
                    os.chdir(wdir)
            except Exception:
                gLogger.exception(
                    'JobWrapperTemplate could not create working directory')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Could Not Create Working Directory')
                return 1

    gJobReport = JobReport(jobID, 'JobWrapper')

    try:
        job = JobWrapper(jobID, gJobReport)
        job.initialize(arguments)
    except Exception:
        gLogger.exception('JobWrapper failed the initialization phase')
        rescheduleResult = rescheduleFailedJob(jobID,
                                               'Job Wrapper Initialization',
                                               gJobReport)
        job.sendJobAccounting(rescheduleResult, 'Job Wrapper Initialization')
        return 1

    if arguments['Job'].has_key('InputSandbox'):
        gJobReport.commit()
        try:
            result = job.transferInputSandbox(arguments['Job']['InputSandbox'])
            if not result['OK']:
                gLogger.warn(result['Message'])
                raise JobWrapperError(result['Message'])
        except Exception:
            gLogger.exception('JobWrapper failed to download input sandbox')
            rescheduleResult = rescheduleFailedJob(jobID,
                                                   'Input Sandbox Download',
                                                   gJobReport)
            job.sendJobAccounting(rescheduleResult, 'Input Sandbox Download')
            return 1
    else:
        gLogger.verbose('Job has no InputSandbox requirement')

    gJobReport.commit()

    if arguments['Job'].has_key('InputData'):
        if arguments['Job']['InputData']:
            try:
                result = job.resolveInputData()
                if not result['OK']:
                    gLogger.warn(result['Message'])
                    raise JobWrapperError(result['Message'])
            except Exception, x:
                gLogger.exception('JobWrapper failed to resolve input data')
                rescheduleResult = rescheduleFailedJob(
                    jobID, 'Input Data Resolution', gJobReport)
                job.sendJobAccounting(rescheduleResult,
                                      'Input Data Resolution')
                return 1
        else:
            gLogger.verbose('Job has a null InputData requirement:')
            gLogger.verbose(arguments)