Example #1
  def test_SSCChain( self ):
    """ full test of functionalities
    """
    ssc = SandboxStoreClient()
    smDB = SandboxMetadataDB()

    exeScriptLocation = find_all( 'exe-script.py', '.', 'WorkloadManagementSystem' )[0]
    fileList = [exeScriptLocation]
    res = ssc.uploadFilesAsSandbox( fileList )
    self.assert_( res['OK'] )
#     SEPFN = res['Value'].split( '|' )[1]
    res = ssc.uploadFilesAsSandboxForJob( fileList, 1, 'Input' )
    self.assert_( res['OK'] )
#     res = ssc.downloadSandboxForJob( 1, 'Input' ) #to run this would need the RSS on
#     self.assert_( res['OK'] )

    # only ones needing the DB
    res = smDB.getUnusedSandboxes()
    self.assert_( res['OK'] )
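
The snippets on this page omit their imports. A likely preamble, assuming the module layout of a recent DIRAC release (these paths are assumptions and move between the DIRAC versions the examples target):

from DIRAC.tests.Utilities.utils import find_all
from DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient import SandboxStoreClient
from DIRAC.WorkloadManagementSystem.DB.SandboxMetadataDB import SandboxMetadataDB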
Example #2
def test_SSCChain():
    """full test of functionalities"""
    ssc = SandboxStoreClient()
    smDB = SandboxMetadataDB()

    exeScriptLocation = find_all("exe-script.py", "..", "/DIRAC/tests/Integration")[0]
    fileList = [exeScriptLocation]
    res = ssc.uploadFilesAsSandbox(fileList)
    assert res["OK"] is True, res["Message"]
    #     SEPFN = res['Value'].split( '|' )[1]
    res = ssc.uploadFilesAsSandboxForJob(fileList, 1, "Input")
    assert res["OK"] is True, res["Message"]
    res = ssc.downloadSandboxForJob(1, "Input")  # to run this we need the RSS on
    print(res)  # for debug...
    assert res["OK"] is True, res["Message"]

    # only ones needing the DB
    res = smDB.getUnusedSandboxes()
    print(res)
    assert res["OK"] is True, res["Message"]
Example #3
def test_SSCChain(self):
  """ full test of functionalities
  """
  ssc = SandboxStoreClient()
  smDB = SandboxMetadataDB()

  exeScriptLocation = find_all('exe-script.py', '..', '/DIRAC/tests/Integration')[0]
  fileList = [exeScriptLocation]
  res = ssc.uploadFilesAsSandbox(fileList)
  assert res['OK'] is True
#     SEPFN = res['Value'].split( '|' )[1]
  res = ssc.uploadFilesAsSandboxForJob(fileList, 1, 'Input')
  assert res['OK'] is True
  res = ssc.downloadSandboxForJob(1, 'Input')  # to run this we need the RSS on
  print res  # for debug...
  assert res['OK'] is True

  # only ones needing the DB
  res = smDB.getUnusedSandboxes()
  print res
  assert res['OK'] is True
Example #4
def test_SSCChain(self):
    """ full test of functionalities
  """
    ssc = SandboxStoreClient()
    smDB = SandboxMetadataDB()

    exeScriptLocation = find_all('exe-script.py', '..',
                                 '/DIRAC/tests/Integration')[0]
    fileList = [exeScriptLocation]
    res = ssc.uploadFilesAsSandbox(fileList)
    assert res['OK'] is True
    #     SEPFN = res['Value'].split( '|' )[1]
    res = ssc.uploadFilesAsSandboxForJob(fileList, 1, 'Input')
    assert res['OK'] is True
    res = ssc.downloadSandboxForJob(1,
                                    'Input')  # to run this we need the RSS on
    print(res)  # for debug...
    assert res['OK'] is True

    # only ones needing the DB
    res = smDB.getUnusedSandboxes()
    print(res)
    assert res['OK'] is True
class InteractiveJobMonitorThread ( threading.Thread ):

    def __init__( self, user, publicIP, looptime, parentthread, output, getinfo ):
      threading.Thread.__init__( self )
      self.sshConnect = ConnectionUtils( user, publicIP )
      self.looptime = looptime
      self.parentthread = parentthread
      self.output = output
      self.getinfo = getinfo

      """
      #SandBox Settings
      """
      self.sandboxClient = SandboxStoreClient()
      self.failedFlag = True
      self.sandboxSizeLimit = 1024 * 1024 * 10

    def run( self ):
      self.log = gLogger.getSubLogger( "InteractiveJobMonitorThread" )
      self.monitoring( self.looptime, self.parentthread, self.output )

    def monitoring( self, loop, parentthread, output ):

      self.initialTiming = os.times()
      accountingReport = AccountingJob()
      accountingReport.setStartTime()

      numberJobsFlag = True
      numberJobs = 0
      numberStartedJobsDict = {}
      numberEndingJobsDict = {}

      job_pattern = re.compile( 'Job =.*?,' )
      job_pattern_2 = re.compile( 'Job =.*?\n' )
      # The job id is the numeric prefix of the last path component of output,
      # e.g. ".../1234_sample.out" -> 1234
      jobid = int( re.split( "_", re.split( "/", output )[int( len( re.split( "/", output ) ) - 1 )] )[0] )

      cmd = '/bin/chmod 555 ' + self.getinfo
      returned = self.commandLaunch( cmd )

      while parentthread.isAlive():
        time.sleep( loop )
        if numberJobsFlag:
          cmd = self.getinfo + ' -c step1'
          returned = self.commandLaunch( cmd )
          self.log.info( 'InteractiveJobMonitorThread:step1:numJobs:', returned )
          if returned is not None:
            if ( returned['Value'][1] != "" ):
              if re.split( "=", returned['Value'][1] )[1].strip().isdigit():
                numberJobs = int( re.split( "=", returned['Value'][1] )[1] )
            if ( numberJobs != 0 ):
              numberJobsFlag = False
              BigDataDB.setJobStatus( jobid, "Running" )
        else:
          cmd = self.getinfo + ' -c step2'
          returned = self.commandLaunch( cmd )
          self.log.info( 'InteractiveJobMonitorThread:step2:startedJobs:', returned )
          if returned != "":
            if ( returned['Value'][1] != "" ):
              startedJobs = job_pattern.findall( returned['Value'][1] )
              self.log.info( 'step2:startedJobs:', startedJobs )
          cmd = self.getinfo + ' -c step3'
          returned = self.commandLaunch( cmd )
          self.log.info( 'InteractiveJobMonitorThread:step3:endedJobs:', returned )
          if returned != "":
            if ( returned['Value'][1] != "" ):
              finishedJobs = job_pattern_2.findall( returned['Value'][1] )
              self.log.info( 'step3:finishedJobs:', finishedJobs )
              if ( len( finishedJobs ) == numberJobs ):
                BigDataDB.setJobStatus( jobid, "Done" )
                BigDataDB.setHadoopID( jobid, finishedJobs )
                self.__updateSandBox( jobid, output )

                # Update Accounting
                EXECUTION_RESULT = {}
                EXECUTION_RESULT['CPU'] = []
                finalStat = os.times()
                for i in range( len( finalStat ) ):
                  EXECUTION_RESULT['CPU'].append( finalStat[i] - self.initialTiming[i] )
                utime, stime, cutime, cstime, elapsed = EXECUTION_RESULT['CPU']
                cpuTime = utime + stime + cutime + cstime
                execTime = elapsed
                result = jobDB.getJobAttributes( jobid )
                getting = result['Value']
                acData = {
                        'User' : getting['Owner'],
                        'UserGroup' : getting['OwnerGroup'],
                        'JobGroup' : 'cesga',
                        'JobType' : 'User',
                        'JobClass' : 'unknown',
                        'ProcessingType' : 'unknown',
                        'FinalMajorStatus' : getting['Status'],
                        'FinalMinorStatus' : getting['MinorStatus'],
                        'CPUTime' : cpuTime,
                        'Site' : getting['Site'],
                        # Based on the factor to convert raw CPU to Normalized units (based on the CPU Model)
                        'NormCPUTime' : 0,
                        'ExecTime' : execTime,
                        'InputDataSize' : 0,
                        'OutputDataSize' : 0,
                        'InputDataFiles' : 0,
                        'OutputDataFiles' : 0,
                        'DiskSpace' : 0,
                        'InputSandBoxSize' : 0,
                        'OutputSandBoxSize' : 0,
                        'ProcessedEvents' : 0
                        }
                accountingReport.setEndTime()
                accountingReport.setValuesFromDict( acData )
                result = accountingReport.commit()



    def commandLaunch( self, cmd ):
      return self.sshConnect.sshCall( 100, cmd )

    def __updateSandBox( self, jobid, output ):

      jobInfo = BigDataDB.getJobIDInfo( jobid )
      result = self.sshConnect.scpCall( 100, output, output, False )

      if not result['OK']:
        self.log.error( 'Failed to get the data from the BigData software DFS:', result )

      outputSandbox = [ output ]

      resolvedSandbox = self.__resolveOutputSandboxFiles( outputSandbox )
      if not resolvedSandbox['OK']:
        self.log.warn( 'Output sandbox file resolution failed:' )
        self.log.warn( resolvedSandbox['Message'] )
        self.__report( 'Failed', 'Resolving Output Sandbox' )
      fileList = resolvedSandbox['Value']['Files']
      missingFiles = resolvedSandbox['Value']['Missing']
      if missingFiles:
        self.jobReport.setJobParameter( 'OutputSandboxMissingFiles', ', '.join( missingFiles ), sendFlag = False )

      if fileList and jobid:
        self.outputSandboxSize = getGlobbedTotalSize( fileList )
        self.log.info( 'Attempting to upload Sandbox with limit:', self.sandboxSizeLimit )

        result = self.sandboxClient.uploadFilesAsSandboxForJob( fileList, jobid,
                                                           'Output', self.sandboxSizeLimit ) # 1024*1024*10
        if not result['OK']:
          self.log.error( 'Output sandbox upload failed with message', result['Message'] )
          if 'SandboxFileName' in result:
            outputSandboxData = result['SandboxFileName']
            self.log.info( 'Attempting to upload %s as output data' % ( outputSandboxData ) )
            outputData.append( outputSandboxData )  # NOTE: outputData is not defined anywhere in this class
            self.jobReport.setJobParameter( 'OutputSandbox', 'Sandbox uploaded to grid storage', sendFlag = False )
            self.jobReport.setJobParameter( 'OutputSandboxLFN',
                                            self.__getLFNfromOutputFile( outputSandboxData )[0], sendFlag = False )
          else:
            self.log.info( 'Could not get SandboxFileName to attempt upload to Grid storage' )
            return S_ERROR( 'Output sandbox upload failed and no file name supplied for failover to Grid storage' )
        else:
          # Do not overwrite in case of Error
          if not self.failedFlag:
            self.__report( 'Completed', 'Output Sandbox Uploaded' )
          self.log.info( 'Sandbox uploaded successfully' )

      return "OK"

    def __resolveOutputSandboxFiles( self, outputSandbox ):
      """Checks the output sandbox file list and resolves any specified wildcards.
         Also tars any specified directories.
      """
      missing = []
      okFiles = []
      for i in outputSandbox:
        self.log.verbose( 'Looking at OutputSandbox file/directory/wildcard: %s' % i )
        globList = glob.glob( i )
        for check in globList:
          if os.path.isfile( check ):
            self.log.verbose( 'Found locally existing OutputSandbox file: %s' % check )
            okFiles.append( check )
          if os.path.isdir( check ):
            self.log.verbose( 'Found locally existing OutputSandbox directory: %s' % check )
            cmd = ['tar', 'cf', '%s.tar' % check, check]
            result = systemCall( 60, cmd )
            if not result['OK']:
              self.log.error( 'Failed to create OutputSandbox tar', result['Message'] )
            elif result['Value'][0]:
              self.log.error( 'Failed to create OutputSandbox tar', result['Value'][2] )
            if os.path.isfile( '%s.tar' % ( check ) ):
              self.log.verbose( 'Appending %s.tar to OutputSandbox' % check )
              okFiles.append( '%s.tar' % ( check ) )
            else:
              self.log.warn( 'Could not tar OutputSandbox directory: %s' % check )
              missing.append( check )

      for i in outputSandbox:
        if i not in okFiles:
          if '%s.tar' % i not in okFiles:
            if not re.search( r'\*', i ):
              if i not in missing:
                missing.append( i )

      result = {'Missing':missing, 'Files':okFiles}
      return S_OK( result )
class BigDataJobMonitoring(AgentModule):
    def initialize(self):
        """ Standard constructor
    """
        import threading

        self.am_setOption("PollingTime", 5)

        self.am_setOption("ThreadStartDelay", 1)
        self.am_setOption("SubmitPools", [])
        self.am_setOption("DefaultSubmitPools", [])

        self.am_setOption("minThreadsInPool", 0)
        self.am_setOption("maxThreadsInPool", 2)
        self.am_setOption("totalThreadsInPool", 40)

        self.callBackLock = threading.Lock()
        self.pendingJobs = {}
        self.monitoringEndPoints = {}

        """
    #SandBox Settings
    """
        self.__tmpSandBoxDir = "/tmp/"
        self.sandboxClient = SandboxStoreClient()
        self.failedFlag = True
        self.sandboxSizeLimit = 1024 * 1024 * 10
        self.fileList = 0
        self.outputSandboxSize = 0

        self.cleanDataAfterFinish = True

        return DIRAC.S_OK()

    def execute(self):
        """Main Agent code:
      1.- Query BigDataDB for existing Running, Queue, or Submitted jobs
      2.- Ask about the status
      3.- Change the status into DB in the case of had changed
    """

        self.pendingJobs["Submitted"] = BigDataDB.getBigDataJobsByStatus("Submitted")
        self.pendingJobs["Running"] = BigDataDB.getBigDataJobsByStatus("Running")
        self.pendingJobs["Unknown"] = BigDataDB.getBigDataJobsByStatus("Unknown")

        self.__getMonitoringPools()
        self.log.verbose("monitoring pools", self.monitoringEndPoints)

        for status in self.pendingJobs:
            self.log.verbose("Analizing %s jobs" % status)
            JobStatus = 0
            if self.pendingJobs[status]["OK"]:
                for jobId in self.pendingJobs[status]["Value"]:
                    self.log.verbose("Analizing job %s" % jobId)
                    getSoftIdAndSiteName = BigDataDB.getSoftwareJobIDByJobID(jobId[0])
                    self.log.verbose("Site and SoftID:", getSoftIdAndSiteName)
                    for runningEndPoint in self.monitoringEndPoints:
                        if (self.monitoringEndPoints[runningEndPoint]["NameNode"] == getSoftIdAndSiteName[0][1]) and (
                            getSoftIdAndSiteName[0][0] != ""
                        ):
                            # Depending on the BigData Software the Query should be different
                            if self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"] == "hadoop":
                                if self.monitoringEndPoints[runningEndPoint]["BigDataSoftwareVersion"] == "hdv1":
                                    if (
                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"]["HLLName"]
                                        == "none"
                                    ):
                                        self.log.info(
                                            "Hadoop V.1 Monitoring submmission command with Hadoop jobID: ",
                                            getSoftIdAndSiteName[0][0],
                                        )
                                        from BigDataDIRAC.WorkloadManagementSystem.Client.HadoopV1Client import (
                                            HadoopV1Client,
                                        )

                                        HadoopV1cli = HadoopV1Client(
                                            self.monitoringEndPoints[runningEndPoint]["User"],
                                            self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                            self.monitoringEndPoints[runningEndPoint]["Port"],
                                        )
                                        JobStatus = HadoopV1cli.jobStatus(
                                            getSoftIdAndSiteName[0][0],
                                            self.monitoringEndPoints[runningEndPoint]["User"],
                                            self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                        )
                                        if (JobStatus["OK"] == True) and (
                                            self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1"
                                        ):
                                            if JobStatus["Value"][1].strip() == "Succeded":
                                                result = HadoopV1cli.newJob(
                                                    self.__tmpSandBoxDir, jobId[0], getSoftIdAndSiteName[0][0]
                                                )

                                                if result["OK"] == True:
                                                    result = BigDataDB.updateHadoopIDAndJobStatus(
                                                        jobId[0], result["Value"]
                                                    )
                                                    BigDataDB.setJobStatus(jobId[0], "Running")
                                                    JobStatus["OK"] = False
                                                else:
                                                    self.log.info("New result from new Job", result)
                                        if JobStatus["OK"] == True:
                                            if JobStatus["Value"][1].strip() == "Succeded":
                                                BigDataDB.setJobStatus(jobId[0], "Done")
                                                if self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1":
                                                    self.__updateInteractiveSandBox(
                                                        jobId[0],
                                                        self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                        self.monitoringEndPoints[runningEndPoint][
                                                            "BigDataSoftwareVersion"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLName"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLVersion"
                                                        ],
                                                        HadoopV1cli,
                                                    )
                                                else:
                                                    self.__updateSandBox(
                                                        jobId[0],
                                                        self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                        self.monitoringEndPoints[runningEndPoint][
                                                            "BigDataSoftwareVersion"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLName"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLVersion"
                                                        ],
                                                        HadoopV1cli,
                                                    )
                                                getStatus = HadoopV1cli.jobCompleteStatus(getSoftIdAndSiteName[0][0])
                                                if getStatus["OK"]:
                                                    result = self.getJobFinalStatusInfo(getStatus["Value"][1])
                                                    # only account when the final status was actually retrieved
                                                    if result["OK"]:
                                                        self.sendJobAccounting(result["Value"], jobId[0])
                                                if self.cleanDataAfterFinish:
                                                    self.__deleteData(jobId[0], HadoopV1cli)
                                            if JobStatus["Value"][1].strip() == "Unknown":
                                                BigDataDB.setJobStatus(jobId[0], "Submitted")
                                            if JobStatus["Value"][1].strip() == "Running":
                                                BigDataDB.setJobStatus(jobId[0], "Running")

                            if self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"] == "hadoop":
                                if self.monitoringEndPoints[runningEndPoint]["BigDataSoftwareVersion"] == "hdv2":
                                    if (
                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"]["HLLName"]
                                        == "none"
                                    ):
                                        self.log.info(
                                            "Hadoop V.2 Monitoring submmission command with Hadoop jobID: ",
                                            getSoftIdAndSiteName[0][0],
                                        )
                                        from BigDataDIRAC.WorkloadManagementSystem.Client.HadoopV2Client import (
                                            HadoopV2Client,
                                        )

                                        HadoopV2cli = HadoopV2Client(
                                            self.monitoringEndPoints[runningEndPoint]["User"],
                                            self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                        )
                                        JobStatus = HadoopV2cli.jobStatus(
                                            getSoftIdAndSiteName[0][0],
                                            self.monitoringEndPoints[runningEndPoint]["User"],
                                            self.monitoringEndPoints[runningEndPoint]["PublicIP"],
                                        )
                                        if (JobStatus["OK"] == True) and (
                                            self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1"
                                        ):
                                            if JobStatus["Value"].strip() == "Succeded":
                                                result = HadoopV2cli.newJob(
                                                    self.__tmpSandBoxDir, jobId[0], getSoftIdAndSiteName[0][0]
                                                )

                                                if result["OK"] == True:
                                                    result = BigDataDB.updateHadoopIDAndJobStatus(
                                                        jobId[0], result["Value"]
                                                    )
                                                    BigDataDB.setJobStatus(jobId[0], "Running")
                                                    JobStatus["OK"] = False
                                                else:
                                                    self.log.info("New result from new Job", result)
                                        if JobStatus["OK"] == True:
                                            if JobStatus["Value"] == "Succeded":
                                                BigDataDB.setJobStatus(jobId[0], "Done")
                                                if self.monitoringEndPoints[runningEndPoint]["IsInteractive"] == "1":
                                                    self.__updateInteractiveSandBox(
                                                        jobId[0],
                                                        self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                        self.monitoringEndPoints[runningEndPoint][
                                                            "BigDataSoftwareVersion"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLName"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLVersion"
                                                        ],
                                                        HadoopV2cli,
                                                    )
                                                else:
                                                    self.__updateSandBox(
                                                        jobId[0],
                                                        self.monitoringEndPoints[runningEndPoint]["BigDataSoftware"],
                                                        self.monitoringEndPoints[runningEndPoint][
                                                            "BigDataSoftwareVersion"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLName"
                                                        ],
                                                        self.monitoringEndPoints[runningEndPoint]["HighLevelLanguage"][
                                                            "HLLVersion"
                                                        ],
                                                        HadoopV2cli,
                                                    )
                                                getStatus = HadoopV2cli.jobCompleteStatus(getSoftIdAndSiteName[0][0])
                                                if getStatus["OK"]:
                                                    result = self.getJobFinalStatusInfo(getStatus["Value"][1])
                                                    # only account when the final status was actually retrieved
                                                    if result["OK"]:
                                                        self.sendJobAccounting(result["Value"], jobId[0])
                                                # if self.cleanDataAfterFinish:
                                                #  self.__deleteData( jobId[0], HadoopV2cli )
                                            if JobStatus["Value"] == "Unknown":
                                                BigDataDB.setJobStatus(jobId[0], "Submitted")
                                            if JobStatus["Value"] == "Running":
                                                BigDataDB.setJobStatus(jobId[0], "Running")
        return DIRAC.S_OK()

    def sendJobAccounting(self, dataFromBDSoft, jobId):
        accountingReport = AccountingJob()
        accountingReport.setStartTime()

        result = jobDB.getJobAttributes(jobId)
        getting = result["Value"]
        if dataFromBDSoft["CPUTime"] == 0:
            cpuTime = 0
            if getting["EndExecTime"] != "None":
                epoch = datetime(1970, 1, 1)
                td = datetime.strptime(getting["EndExecTime"], "%Y-%m-%d %H:%M:%S") - epoch
                EndExecTime = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 1e6
                td = datetime.strptime(getting["SubmissionTime"], "%Y-%m-%d %H:%M:%S") - epoch
                SubmissionTime = (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10 ** 6) / 1e6
                cpuTime = EndExecTime - SubmissionTime
        else:
            cpuTime = dataFromBDSoft["CPUTime"] / 1000

        acData = {
            "User": getting["Owner"],
            "UserGroup": getting["OwnerGroup"],
            "JobGroup": "cesga",
            "JobType": "User",
            "JobClass": "unknown",
            "ProcessingType": "unknown",
            "FinalMajorStatus": getting["Status"],
            "FinalMinorStatus": getting["MinorStatus"],
            "CPUTime": cpuTime,
            "Site": getting["Site"],
            # Based on the factor to convert raw CPU to Normalized units (based on the CPU Model)
            "NormCPUTime": 0,
            "ExecTime": cpuTime,
            "InputDataSize": dataFromBDSoft["InputDataSize"],
            "OutputDataSize": dataFromBDSoft["OutputDataSize"],
            "InputDataFiles": dataFromBDSoft["InputDataFiles"],
            "OutputDataFiles": len(self.fileList),
            "DiskSpace": 0,
            "InputSandBoxSize": 0,
            "OutputSandBoxSize": self.outputSandboxSize,
            "ProcessedEvents": 0,
        }
        accountingReport.setEndTime()
        accountingReport.setValuesFromDict(acData)
        self.log.debug("Info for accounting: ", acData)
        result = accountingReport.commit()
        self.log.debug("Accounting insertion: ", result)
        return result

    def getJobFinalStatusInfo(self, jobData):
        JobOutputInfo = {}

        resulting = re.search("Read=(\d+)", jobData)
        if resulting != None:
            JobOutputInfo["InputDataSize"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["InputDataSize"] = 0

        resulting = re.search("Written=(\d+)", jobData)
        if resulting != None:
            JobOutputInfo["OutputDataSize"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["OutputDataSize"] = 0

        resulting = re.search("Map input records=(\d+)", jobData)
        if resulting != None:
            JobOutputInfo["InputDataFiles"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["InputDataFiles"] = 0

        resulting = re.search("CPU.*?=(\d+)", jobData)
        if resulting != None:
            JobOutputInfo["CPUTime"] = int(resulting.group(0).split("=")[1])
        else:
            JobOutputInfo["CPUTime"] = 0

        JobOutputInfo["ExecTime"] = 0

        return S_OK(JobOutputInfo)
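
    # Illustration (hypothetical counter text; real `hadoop job` output differs,
    # but these are exactly the four patterns the method above searches for):
    #   sample = "Bytes Read=2048 Bytes Written=1024 Map input records=7 CPU time spent (ms)=5300"
    #   re.search(r"Read=(\d+)", sample).group(1)               -> "2048"
    #   re.search(r"Written=(\d+)", sample).group(1)            -> "1024"
    #   re.search(r"Map input records=(\d+)", sample).group(1)  -> "7"
    #   re.search(r"CPU.*?=(\d+)", sample).group(1)             -> "5300"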

    def __deleteData(self, jobid, cli):
        source = self.__tmpSandBoxDir + str(jobid)
        shutil.rmtree(source)
        result = cli.delData(source)
        if not result["OK"]:
            self.log.error("Error the data on BigData cluster could not be deleted", result)
            return S_ERROR("Data can not be deleted")
        return "Data deleted"

    def __updateInteractiveSandBox(self, jobid, software, version, hll, hllversion, cli):
        # Delete the content of the InputSandbox

        jobInfo = BigDataDB.getJobIDInfo(jobid)
        source = self.__tmpSandBoxDir + str(jobid) + "/*_out"
        dest = self.__tmpSandBoxDir + str(jobid)
        result = 0

        result = cli.delHadoopData(self.__tmpSandBoxDir + str(jobid) + "/InputSandbox" + str(jobid))
        self.log.debug("ATENTION::Deleting InputSandBox Contain:", result)

        result = cli.getdata(dest, source)
        self.log.debug("Step 0:getting data from hadoop:", result)
        if not result["OK"]:
            self.log.error("Error to get the data from BigData Cluster to DIRAC:", result)

        self.log.debug("Step:1:GetFilePaths:")
        outputSandbox = self.get_filepaths(self.__tmpSandBoxDir + str(jobid))
        self.log.debug("Step:2:OutputSandBox:", self.__tmpSandBoxDir + str(jobid))
        self.log.debug("Step:2:OutputSandBox:", outputSandbox)
        resolvedSandbox = self.__resolveOutputSandboxFiles(outputSandbox)

        self.log.debug("Step:3:ResolveSandbox:", resolvedSandbox)
        if not resolvedSandbox["OK"]:
            self.log.warn("Output sandbox file resolution failed:")
            self.log.warn(resolvedSandbox["Message"])
            self.__report("Failed", "Resolving Output Sandbox")
        self.fileList = resolvedSandbox["Value"]["Files"]
        missingFiles = resolvedSandbox["Value"]["Missing"]
        if missingFiles:
            self.jobReport.setJobParameter("OutputSandboxMissingFiles", ", ".join(missingFiles), sendFlag=False)

        if self.fileList and jobid:
            self.outputSandboxSize = getGlobbedTotalSize(self.fileList)
            self.log.info("Attempting to upload Sandbox with limit:", self.sandboxSizeLimit)

            result = self.sandboxClient.uploadFilesAsSandboxForJob(
                self.fileList, jobid, "Output", self.sandboxSizeLimit
            )  # 1024*1024*10
            if not result["OK"]:
                self.log.error("Output sandbox upload failed with message", result["Message"])
                if result.has_key("SandboxFileName"):
                    outputSandboxData = result["SandboxFileName"]
                    self.log.info("Attempting to upload %s as output data" % (outputSandboxData))
                    outputData.append(outputSandboxData)
                    self.jobReport.setJobParameter("OutputSandbox", "Sandbox uploaded to grid storage", sendFlag=False)
                    self.jobReport.setJobParameter(
                        "OutputSandboxLFN", self.__getLFNfromOutputFile(outputSandboxData)[0], sendFlag=False
                    )
                else:
                    self.log.info("Could not get SandboxFileName to attempt upload to Grid storage")
                    return S_ERROR(
                        "Output sandbox upload failed and no file name supplied for failover to Grid storage"
                    )
            else:
                # Do not overwrite in case of Error
                if not self.failedFlag:
                    self.__report("Completed", "Output Sandbox Uploaded")
                self.log.info("Sandbox uploaded successfully")

        return "OK"

    def __updateSandBox(self, jobid, software, version, hll, hllversion, cli):
        jobInfo = BigDataDB.getJobIDInfo(jobid)

        source = (
            self.__tmpSandBoxDir
            + str(jobid)
            + "/InputSandbox"
            + str(jobid)
            + "/"
            + self.__getJobName(jobInfo[0][0]).replace(" ", "")
            + "_"
            + str(jobid)
        )
        dest = (
            self.__tmpSandBoxDir
            + str(jobid)
            + "/"
            + self.__getJobName(jobInfo[0][0]).replace(" ", "")
            + "_"
            + str(jobid)
        )
        result = S_ERROR("Unsupported BigData endpoint configuration")  # default result so the check below never indexes an int
        if (software == "hadoop") and (version == "hdv1") and (hll == "none"):
            result = cli.getData(source, dest)
        if (software == "hadoop") and (version == "hdv2") and (hll == "none"):
            result = cli.getData(source, dest)
        if not result["OK"]:
            self.log.error("Error to get the data from BigData Software DFS:", result)

        result = cli.getdata(dest, dest)
        if not result["OK"]:
            self.log.error("Error to get the data from BigData Cluster to DIRAC:", result)

        outputSandbox = self.get_filepaths(dest)

        resolvedSandbox = self.__resolveOutputSandboxFiles(outputSandbox)
        if not resolvedSandbox["OK"]:
            self.log.warn("Output sandbox file resolution failed:")
            self.log.warn(resolvedSandbox["Message"])
            self.__report("Failed", "Resolving Output Sandbox")
        self.fileList = resolvedSandbox["Value"]["Files"]
        missingFiles = resolvedSandbox["Value"]["Missing"]
        if missingFiles:
            self.jobReport.setJobParameter("OutputSandboxMissingFiles", ", ".join(missingFiles), sendFlag=False)

        if self.fileList and jobid:
            self.outputSandboxSize = getGlobbedTotalSize(self.fileList)
            self.log.info("Attempting to upload Sandbox with limit:", self.sandboxSizeLimit)

            result = self.sandboxClient.uploadFilesAsSandboxForJob(
                self.fileList, jobid, "Output", self.sandboxSizeLimit
            )  # 1024*1024*10
            if not result["OK"]:
                self.log.error("Output sandbox upload failed with message", result["Message"])
                if result.has_key("SandboxFileName"):
                    outputSandboxData = result["SandboxFileName"]
                    self.log.info("Attempting to upload %s as output data" % (outputSandboxData))
                    outputData.append(outputSandboxData)
                    self.jobReport.setJobParameter("OutputSandbox", "Sandbox uploaded to grid storage", sendFlag=False)
                    self.jobReport.setJobParameter(
                        "OutputSandboxLFN", self.__getLFNfromOutputFile(outputSandboxData)[0], sendFlag=False
                    )
                else:
                    self.log.info("Could not get SandboxFileName to attempt upload to Grid storage")
                    return S_ERROR(
                        "Output sandbox upload failed and no file name supplied for failover to Grid storage"
                    )
            else:
                # Do not overwrite in case of Error
                if not self.failedFlag:
                    self.__report("Completed", "Output Sandbox Uploaded")
                self.log.info("Sandbox uploaded successfully")

        return "OK"

    def __getLFNfromOutputFile(self, outputFile, outputPath=""):
        """Provides a generic convention for VO output data
       files if no path is specified.
    """

        if not re.search("^LFN:", outputFile):
            localfile = outputFile
            initial = self.owner[:1]
            vo = getVOForGroup(self.userGroup)
            if not vo:
                vo = "dirac"
            basePath = "/" + vo + "/user/" + initial + "/" + self.owner
            if outputPath:
                # If output path is given, append it to the user path and put output files in this directory
                if outputPath.startswith("/"):
                    outputPath = outputPath[1:]
            else:
                # By default the output path is constructed from the job id
                subdir = str(self.jobID // 1000)  # integer division keeps the path clean under Python 3
                outputPath = subdir + "/" + str(self.jobID)
            lfn = os.path.join(basePath, outputPath, os.path.basename(localfile))
        else:
            # if LFN is given, take it as it is
            localfile = os.path.basename(outputFile.replace("LFN:", ""))
            lfn = outputFile.replace("LFN:", "")

        return (lfn, localfile)
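
    # Worked example of the default convention above, with hypothetical values:
    #   owner = "jdoe", vo = "dirac", jobID = 123456 -> subdir = "123"
    #   lfn = "/dirac/user/j/jdoe/123/123456/std.out"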

    def get_filepaths(self, directory):
        """
    This function will generate the file names in a directory
    """
        file_paths = []
        for root, directories, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                file_paths.append(filepath)
        return file_paths

    def __resolveOutputSandboxFiles(self, outputSandbox):
        """Checks the output sandbox file list and resolves any specified wildcards.
       Also tars any specified directories.
    """
        missing = []
        okFiles = []
        for i in outputSandbox:
            self.log.verbose("Looking at OutputSandbox file/directory/wildcard: %s" % i)
            globList = glob.glob(i)
            for check in globList:
                if os.path.isfile(check):
                    self.log.verbose("Found locally existing OutputSandbox file: %s" % check)
                    okFiles.append(check)
                if os.path.isdir(check):
                    self.log.verbose("Found locally existing OutputSandbox directory: %s" % check)
                    cmd = ["tar", "cf", "%s.tar" % check, check]
                    result = systemCall(60, cmd)
                    if not result["OK"]:
                        self.log.error("Failed to create OutputSandbox tar", result["Message"])
                    elif result["Value"][0]:
                        self.log.error("Failed to create OutputSandbox tar", result["Value"][2])
                    if os.path.isfile("%s.tar" % (check)):
                        self.log.verbose("Appending %s.tar to OutputSandbox" % check)
                        okFiles.append("%s.tar" % (check))
                    else:
                        self.log.warn("Could not tar OutputSandbox directory: %s" % check)
                        missing.append(check)

        for i in outputSandbox:
            if i not in okFiles:
                if "%s.tar" % i not in okFiles:
                    if not re.search(r"\*", i):
                        if i not in missing:
                            missing.append(i)

        result = {"Missing": missing, "Files": okFiles}
        return S_OK(result)
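
    # Sketch of the contract, e.g. for outputSandbox = ["*.out", "logs"]:
    #   "*.out"  globs to plain files -> appended to "Files" as-is
    #   "logs"   is a directory       -> tarred to "logs.tar", which is appended
    #   a non-wildcard entry matched neither way -> reported under "Missing"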

    def __getJobName(self, jobName):
        result = re.split("_", jobName)
        return result[0]

    def __getMonitoringPools(self):

        for monitoringPool in self.am_getOption("SubmitPools"):
            self.log.verbose("Monitoring Pools", monitoringPool)
            pathPools = self.am_getModuleParam("section") + "/" + monitoringPool + "/EndPointMonitoring"
            monitorings = gConfig.getValue(pathPools, "")
            for endpoint in re.split(",", monitorings):
                if endpoint:
                    self.configureFromSection("/Resources/BigDataEndpoints/", endpoint)

        return "OK"

    def configureFromSection(self, mySection, endPoint):
        """
      get CS for monitoring endpoints
    """
        self.log.debug("Configuring from %s" % mySection)

        monitoringBDEndPointDict = BigDataDB.getRunningEnPointDict(endPoint)
        if not monitoringBDEndPointDict["OK"]:
            self.log.error("Error in RunninggBDEndPointDict: %s" % monitoringBDEndPointDict["Message"])
            return monitoringBDEndPointDict
        self.log.verbose("Trying to configure RunningBDEndPointDict:", monitoringBDEndPointDict)
        monitoringBDEndPointDict = monitoringBDEndPointDict["Value"]
        for option in [
            "NameNode",
            "Port",
            "SiteName",
            "BigDataSoftware",
            "BigDataSoftwareVersion",
            "HighLevelLanguage",
            "LimitQueueJobsEndPoint",
            "URL",
            "PublicIP",
        ]:
            if option not in monitoringBDEndPointDict:
                self.log.error('Missing option in "%s" EndPoint definition:' % endPoint, option)
                continue

        self.monitoringEndPoints[endPoint] = {}
        self.monitoringEndPoints[endPoint]["NameNode"] = monitoringBDEndPointDict["NameNode"]
        self.monitoringEndPoints[endPoint]["Port"] = int(monitoringBDEndPointDict["Port"])
        self.monitoringEndPoints[endPoint]["SiteName"] = monitoringBDEndPointDict["SiteName"]
        self.monitoringEndPoints[endPoint]["BigDataSoftware"] = monitoringBDEndPointDict["BigDataSoftware"]
        self.monitoringEndPoints[endPoint]["BigDataSoftwareVersion"] = monitoringBDEndPointDict[
            "BigDataSoftwareVersion"
        ]
        self.monitoringEndPoints[endPoint]["LimitQueueJobsEndPoint"] = int(
            monitoringBDEndPointDict["LimitQueueJobsEndPoint"]
        )
        self.monitoringEndPoints[endPoint]["URL"] = monitoringBDEndPointDict["URL"]
        self.monitoringEndPoints[endPoint]["User"] = monitoringBDEndPointDict["User"]
        self.monitoringEndPoints[endPoint]["PublicIP"] = monitoringBDEndPointDict["PublicIP"]
        self.monitoringEndPoints[endPoint]["IsInteractive"] = monitoringBDEndPointDict["IsInteractive"]

        self.monitoringEndPoints[endPoint]["HighLevelLanguage"] = monitoringBDEndPointDict["HighLevelLanguage"]