예제 #1
0
  def test_matcher( self ):
    # insert a proper DN to run the test
    resourceDescription = {'OwnerGroup': 'prod', 'OwnerDN':'/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo', 'ReleaseVersion':'blabla', 'VirtualOrganization':'LHCB',
                           'PilotInfoReportedFlag':'True', 'PilotBenchmark':'anotherPilot', 'LHCbPlatform':'CERTO',
                           'Site':'DIRAC.Jenkins.org', 'CPUTime' : 86400 }
    matcher = RPCClient( 'WorkloadManagement/Matcher' )
    JobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination( 'DIRAC.Jenkins.org' )
    job.setInputData( '/a/bbb' )
    job.setType( 'User' )
    jobDescription = createFile( job )
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )

    jobID = res['Value']

    res = JobStateUpdate.setJobStatus( jobID, 'Waiting', 'matching', 'source' )
    self.assert_( res['OK'] )


    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup':'prod', 'Setup':'JenkinsSetup', 'CPUTime':86400}
    res = tqDB.insertJob( jobID, tqDefDict, 10 )
    self.assert_( res['OK'] )

    res = matcher.requestJob( resourceDescription )
    print res
    self.assert_( res['OK'] )
    wmsClient.deleteJob( jobID )
예제 #2
0
  def test_matcher( self ):
    # insert a proper DN to run the test
    resourceDescription = {'OwnerGroup': 'prod', 'OwnerDN':'/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                           'DIRACVersion': 'pippo', 'ReleaseVersion':'blabla', 'VirtualOrganization':'LHCB',
                           'PilotInfoReportedFlag':'True', 'PilotBenchmark':'anotherPilot', 'LHCbPlatform':'CERTO',
                           'Site':'DIRAC.Jenkins.org', 'CPUTime' : 86400 }
    matcher = RPCClient( 'WorkloadManagement/Matcher' )
    JobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )
    wmsClient = WMSClient()

    job = helloWorldJob()
    job.setDestination( 'DIRAC.Jenkins.org' )
    job.setInputData( '/a/bbb' )
    job.setType( 'User' )
    jobDescription = createFile( job )
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )

    jobID = res['Value']

    res = JobStateUpdate.setJobStatus( jobID, 'Waiting', 'matching', 'source' )
    self.assert_( res['OK'] )


    tqDB = TaskQueueDB()
    tqDefDict = {'OwnerDN': '/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser/[email protected]',
                 'OwnerGroup':'prod', 'Setup':'dirac-JenkinsSetup', 'CPUTime':86400}
    res = tqDB.insertJob( jobID, tqDefDict, 10 )
    self.assert_( res['OK'] )

    res = matcher.requestJob( resourceDescription )
    print res
    self.assert_( res['OK'] )
    wmsClient.deleteJob( jobID )
예제 #3
0
  def test_ParametricChain(self):
    """ This test will submit a parametric job which should generate 3 actual jobs
    """
    wmsClient = WMSClient()
    jobStateUpdate = JobStateUpdateClient()
    jobMonitor = JobMonitoringClient()

    # create the job
    job = parametricJob()
    jobDescription = createFile(job)

    # submit the job
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result['OK'])
    jobIDList = result['Value']
    self.assertEqual(len(jobIDList), 3)

    result = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(result['OK'])
    jobNames = [result['Value'][jobID]['JobName'] for jobID in result['Value']]
    self.assertEqual(set(jobNames), set(['parametric_helloWorld_%s' % nJob for nJob in range(3)]))

    for jobID in jobIDList:
      result = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
      self.assertTrue(result['OK'])

    result = wmsClient.deleteJob(jobIDList)
    self.assertTrue(result['OK'])

    for jobID in jobIDList:
      result = jobMonitor.getJobStatus(jobID)
      self.assertTrue(result['OK'])
      self.assertEqual(result['Value'], 'Deleted')
예제 #4
0
  def test_ParametricChain(self):
    """ This test will submit a parametric job which should generate 3 actual jobs
    """
    wmsClient = WMSClient()
    jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')
    jobMonitor = JobMonitoringClient()

    # create the job
    job = parametricJob()
    jobDescription = createFile(job)

    # submit the job
    result = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
    self.assertTrue(result['OK'])
    jobIDList = result['Value']
    self.assertEqual(len(jobIDList), 3)

    result = jobMonitor.getJobsParameters(jobIDList, ['JobName'])
    self.assertTrue(result['OK'])
    jobNames = [result['Value'][jobID]['JobName'] for jobID in result['Value']]
    self.assertEqual(set(jobNames), set(['parametric_helloWorld_%s' % nJob for nJob in range(3)]))

    for jobID in jobIDList:
      result = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
      self.assertTrue(result['OK'])

    result = wmsClient.deleteJob(jobIDList)
    self.assertTrue(result['OK'])

    for jobID in jobIDList:
      result = jobMonitor.getJobStatus(jobID)
      self.assertTrue(result['OK'])
      self.assertEqual(result['Value'], 'Deleted')
예제 #5
0
    def test_matcher(self):
        # insert a proper DN to run the test
        resourceDescription = {
            "OwnerGroup": "prod",
            "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
            "DIRACVersion": "pippo",
            "GridCE": "some.grid.ce.org",
            "ReleaseVersion": "blabla",
            "VirtualOrganization": "LHCb",
            "PilotInfoReportedFlag": "True",
            "PilotBenchmark": "anotherPilot",
            "Site": "DIRAC.Jenkins.ch",
            "CPUTime": 86400,
        }
        wmsClient = WMSClient()

        job = helloWorldJob()
        job.setDestination("DIRAC.Jenkins.ch")
        job.setInputData("/a/bbb")
        job.setType("User")
        jobDescription = createFile(job)
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))

        jobID = res["Value"]

        # forcing the update
        res = JobStateUpdateClient().setJobStatus(jobID, JobStatus.WAITING,
                                                  "matching", "source", None,
                                                  True)
        self.assertTrue(res["OK"], res.get("Message"))

        tqDB = TaskQueueDB()
        tqDefDict = {
            "OwnerDN": "/C=ch/O=DIRAC/OU=DIRAC CI/CN=ciuser",
            "OwnerGroup": "prod",
            "Setup": "dirac-JenkinsSetup",
            "CPUTime": 86400,
        }
        res = tqDB.insertJob(jobID, tqDefDict, 10)
        self.assertTrue(res["OK"], res.get("Message"))

        res = MatcherClient().requestJob(resourceDescription)
        print(res)
        self.assertTrue(res["OK"], res.get("Message"))
        wmsClient.deleteJob(jobID)
예제 #6
0
    def deleteJobsByStatus(self, condDict, delay=False):
        """Sets the job status to "DELETED" for jobs in condDict.

        :param dict condDict: a dict like {'JobType': 'User', 'Status': 'Killed'}
        :param int delay: days of delay
        :returns: S_OK/S_ERROR
        """

        res = self._getJobsList(condDict, delay)
        if not res["OK"]:
            return res
        jobList = res["Value"]
        if not jobList:
            return S_OK()

        self.log.notice("Attempting to delete jobs",
                        "(%d for %s)" % (len(jobList), condDict))

        result = self.deleteJobOversizedSandbox(
            jobList)  # This might set a request
        if not result["OK"]:
            self.log.error("Cannot schedule removal of oversized sandboxes",
                           result["Message"])
            return result

        failedJobs = result["Value"][JobStatus.FAILED]
        for job in failedJobs:
            jobList.pop(jobList.index(job))
        if not jobList:
            return S_OK()

        ownerJobsDict = self._getOwnerJobsDict(jobList)

        fail = False
        for owner, jobsList in ownerJobsDict.items():
            ownerDN = owner.split(";")[0]
            ownerGroup = owner.split(";")[1]
            self.log.verbose(
                "Attempting to delete jobs",
                "(n=%d) for %s : %s" % (len(jobsList), ownerDN, ownerGroup))
            wmsClient = WMSClient(useCertificates=True,
                                  delegatedDN=ownerDN,
                                  delegatedGroup=ownerGroup)
            result = wmsClient.deleteJob(jobsList)
            if not result["OK"]:
                self.log.error(
                    "Could not delete jobs",
                    "for %s : %s (n=%d) : %s" %
                    (ownerDN, ownerGroup, len(jobsList), result["Message"]),
                )
                fail = True

        if fail:
            return S_ERROR()

        return S_OK()
예제 #7
0
    def test_ParametricChain(self):
        """This test will submit a parametric job which should generate 3 actual jobs"""
        wmsClient = WMSClient()
        jobStateUpdate = JobStateUpdateClient()
        jobMonitor = JobMonitoringClient()

        # create the job
        job = parametricJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))
        jobIDList = res["Value"]
        self.assertEqual(len(jobIDList), 3, msg="Got %s" % str(jobIDList))

        res = jobMonitor.getJobsParameters(jobIDList, ["JobName"])
        self.assertTrue(res["OK"], res.get("Message"))
        jobNames = [res["Value"][jobID]["JobName"] for jobID in res["Value"]]
        self.assertEqual(
            set(jobNames),
            set(["parametric_helloWorld_%s" % nJob for nJob in range(3)]))

        for jobID in jobIDList:
            res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING,
                                              "checking", "source")
            self.assertTrue(res["OK"], res.get("Message"))

        res = wmsClient.deleteJob(jobIDList)
        self.assertTrue(res["OK"], res.get("Message"))
        print(res)

        for jobID in jobIDList:
            res = jobMonitor.getJobsStatus(jobID)
            self.assertTrue(res["OK"], res.get("Message"))
            self.assertEqual(res["Value"][jobID]["Status"],
                             JobStatus.DELETED,
                             msg="Got %s" % str(res["Value"]))
예제 #8
0
  def test_JobStateUpdateAndJobMonitoring( self ):
    """ Verifying all JobStateUpdate and JobMonitoring functions
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    # create a job and check stuff
    job = helloWorldJob()
    jobDescription = createFile( job )

    # submitting the job. Checking few stuff
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )
    jobID = int ( res['Value'] )
    # jobID = res['JobID']
    res = jobMonitor.getJobJDL( jobID, True )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobJDL( jobID, False )
    self.assert_( res['OK'] )

    # Adding stuff
    res = jobStateUpdate.setJobStatus( jobID, 'Matched', 'matching', 'source' )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobParameters( jobID, [( 'par1', 'par1Value' ), ( 'par2', 'par2Value' )] )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobApplicationStatus( jobID, 'app status', 'source' )
    self.assert_( res['OK'] )
#     res = jobStateUpdate.setJobFlag()
#     self.assert_( res['OK'] )
#     res = jobStateUpdate.unsetJobFlag()
#     self.assert_( res['OK'] )
    res = jobStateUpdate.setJobSite( jobID, 'Site' )
    self.assert_( res['OK'] )
#     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
#     self.assert_( res['OK'] )

    # now checking few things
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Running' )
    res = jobMonitor.getJobParameter( jobID, 'par1' )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], {'par1': 'par1Value'} )
    res = jobMonitor.getJobParameters( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], {'par1': 'par1Value', 'par2': 'par2Value'} )
    res = jobMonitor.getJobAttribute( jobID, 'Site' )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Site' )
    res = jobMonitor.getJobAttributes( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    self.assertEqual( res['Value']['JobName'], 'helloWorld' )
    res = jobMonitor.getJobSummary( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    self.assertEqual( res['Value']['Status'], 'Running' )
    res = jobMonitor.getJobHeartBeatData( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], [] )
    res = jobMonitor.getInputData( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], [] )
    res = jobMonitor.getJobPrimarySummary( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getAtticJobParameters( jobID )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobsStatus( [jobID], 'Done', 'MinorStatus', 'Unknown' )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobSummary( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value']['Status'], 'Done' )
    self.assertEqual( res['Value']['MinorStatus'], 'MinorStatus' )
    self.assertEqual( res['Value']['ApplicationStatus'], 'app status' )
    res = jobStateUpdate.sendHeartBeat( jobID, {'bih':'bih'}, {'boh':'boh'} )
    self.assert_( res['OK'] )


    # delete the job - this will just set its status to "deleted"
    wmsClient.deleteJob( jobID )
예제 #9
0
class TransformationCleaningAgent( AgentModule ):
  '''
  .. class:: TransformationCleaningAgent

  :param ReplicaManger replicaManager: ReplicaManager instance
  :param TransfromationClient transClient: TransfromationClient instance
  :param RequestClient requestClient: RequestClient instance
  :param FileCatalogClient metadataClient: FileCatalogClient instance

  '''

  def __init__( self, *args, **kwargs ):
    ''' c'tor
    '''
    AgentModule.__init__( self, *args, **kwargs )
    # # replica manager
    self.replicaManager = ReplicaManager()
    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.requestClient = RequestClient()
    # # file catalog clinet
    self.metadataClient = FileCatalogClient()

    # # placeholders for CS options

    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = None
    # # transformation metadata
    self.transfidmeta = None
    # # archive periof in days
    self.archiveAfter = None
    # # active SEs
    self.activeStorages = None
    # # transformation log SEs
    self.logSE = None
    # # enable/disable execution
    self.enableFlag = None

  def initialize( self ):
    ''' agent initialisation

    reading and setting confing opts

    :param self: self reference
    '''
    # # shifter proxy
    self.am_setOption( 'shifterProxy', 'DataManager' )
    # # transformations types
    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = sortList( agentTSTypes )
    else:
      dataProc = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
      dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] )
      self.transformationTypes = sortList( dataProc + dataManip )
    self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) )
    # # directory locations
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB',
                                                                                   'MetadataCatalog' ] ) )
    self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    # # transformation metadata
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    # # archive periof in days
    self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 )  # days
    self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter )
    # # active SEs
    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    # # transformation log SEs
    self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' )
    self.log.info( "Will remove logs found on storage element: %s" % self.logSE )
    # # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    return S_OK()

  #############################################################################
  def execute( self ):
    ''' execution in one agent's cycle

    :param self: self reference
    '''

    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' )
      return S_OK( 'Disabled via CS flag' )

    # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations( { 'Status' : 'Cleaning',
                                                 'Type' : self.transformationTypes } )
    if res['OK']:
      for transDict in res['Value']:
        # # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # # We just archive
        if transDict[ 'Type' ] in [ 'Replication', 'Removal' ]:
          res = self.archiveTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                         res['Message'] ) )
        else:
          res = self.cleanTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'],
                                                                        res['Message'] ) )


    # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
    res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles',
                                                 'Type' : self.transformationTypes} )
    if res['OK']:
      for transDict in res['Value']:
        res = self.removeTransformationOutput( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )

    # # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter )
    res = self.transClient.getTransformations( { 'Status' : 'Completed',
                                                 'Type' : self.transformationTypes },
                                                 older = olderThanTime,
                                                 timeStamp = 'LastUpdate' )
    if res['OK']:
      for transDict in res['Value']:
        res = self.archiveTransformation( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )
    else:
      self.log.error( "Could not get the transformations" )

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    ''' get the directories for the supplied transformation from the transformation system

    :param self: self reference
    :param int transID: transformation ID
    '''
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        self.log.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      transDirectories = res['Value'].splitlines()
      directories = self._addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} )
      if not res['OK']:
        self.log.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )

    if not directories:
      self.log.info( "No output directories found" )
    directories = sortList( directories )
    return S_OK( directories )

  @classmethod
  def _addDirs( self, transID, newDirs, existingDirs ):
    ''' append uniqe :newDirs: list to :existingDirs: list

    :param self: self reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    '''
    for folder in newDirs:
      transStr = str( transID ).zfill( 8 )
      if re.search( transStr, str( folder ) ):
        if not folder in existingDirs:
          existingDirs.append( folder )
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanStorageContents( self, directory ):
    ''' delete lfn dir from all active SE

    :param self: self reference
    :param sre directory: folder name
    '''
    for storageElement in self.activeStorages:
      res = self.__removeStorageDirectory( directory, storageElement )
      if not res['OK']:
        return res
    return S_OK()

  def __removeStorageDirectory( self, directory, storageElement ):
    ''' wipe out all contents from :directory: at :storageElement:

    :param self: self reference
    :param str directory: path
    :param str storageElement: SE name
    '''
    self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) )
    res = self.replicaManager.getPfnForLfn( [directory], storageElement )
    if not res['OK']:
      self.log.error( "Failed to get PFN for directory", res['Message'] )
      return res
    for directory, error in res['Value']['Failed'].items():
      self.log.error( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectory = res['Value']['Successful'].values()[0]
    res = self.replicaManager.getStorageFileExists( storageDirectory, storageElement, singleFile = True )
    if not res['OK']:
      self.log.error( "Failed to obtain existance of directory", res['Message'] )
      return res
    exists = res['Value']
    if not exists:
      self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) )
      return S_OK()
    res = self.replicaManager.removeStorageDirectory( storageDirectory,
                                                      storageElement,
                                                      recursive = True,
                                                      singleDirectory = True )
    if not res['OK']:
      self.log.error( "Failed to remove storage directory", res['Message'] )
      return res
    self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'],
                                                                     directory,
                                                                     storageElement ) )
    return S_OK()

  def cleanCatalogContents( self, directory ):
    ''' wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    '''
    res = self.__getCatalogDirectoryContents( [directory] )
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      return S_OK()
    self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) )
    res = self.replicaManager.removeFile( filesFound )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the catalog" )
    return S_OK()

  def __getCatalogDirectoryContents( self, directories ):
    ''' get catalog contents under paths :directories:

    :param self: self reference
    :param list directories: list of paths in catalog
    '''
    self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) )
    for directory in directories:
      self.log.info( directory )
    activeDirs = directories
    allFiles = {}
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = self.replicaManager.getCatalogListDirectory( currentDir, singleFile = True )
      activeDirs.remove( currentDir )
      if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ):
        self.log.info( "The supplied directory %s does not exist" % currentDir )
      elif not res['OK']:
        self.log.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Message'] ) )
      else:
        dirContents = res['Value']
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )
    self.log.info( "Found %d files" % len( allFiles ) )
    return S_OK( allFiles.keys() )

  def cleanTransformationLogFiles( self, directory ):
    ''' clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    '''
    self.log.info( "Removing log files found in the directory %s" % directory )
    res = self.replicaManager.removeStorageDirectory( directory, self.logSE, singleDirectory = True )
    if not res['OK']:
      self.log.error( "Failed to remove log files", res['Message'] )
      return res
    self.log.info( "Successfully removed transformation log directory" )
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput( self, transID ):
    ''' This just removes any mention of the output data from the catalog and storage '''
    self.log.info( "Removing output data for transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search( '/LOG/', directory ):
        res = self.cleanCatalogContents( directory )
        if not res['OK']:
          return res
        res = self.cleanStorageContents( directory )
        if not res['OK']:
          return res
    self.log.info( "Removed directories in the catalog and storage for transformation" )
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID, directories )
    if not res['OK']:
      return res
    self.log.info( "Successfully removed output of transformation %d" % transID )
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) )
    return S_OK()

  def archiveTransformation( self, transID ):
    ''' This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    '''
    self.log.info( "Archiving transformation %s" % transID )
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully archived transformation %d" % transID )
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Archived" % ( transID ) )
    return S_OK()

  def cleanTransformation( self, transID ):
    ''' This removes any mention of the supplied transformation
    '''
    self.log.info( "Cleaning transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search( '/LOG/', directory ):
        res = self.cleanTransformationLogFiles( directory )
        if not res['OK']:
          return res
      res = self.cleanCatalogContents( directory )
      if not res['OK']:
        return res
      res = self.cleanStorageContents( directory )
      if not res['OK']:
        return res
    # Clean ALL the possible remnants found in the BK
    res = self.cleanMetadataCatalogFiles( transID, directories )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully cleaned transformation %d" % transID )
    # Change the status of the transformation to deleted
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Deleted' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Deleted" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Deleted" % ( transID ) )
    return S_OK()

  def cleanMetadataCatalogFiles( self, transID ):
    ''' wipe out files from catalog '''
    res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } )
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info( 'No files found for transID %s' % transID )
      return S_OK()
    res = self.replicaManager.removeFile( fileToRemove )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the metadata catalog" )
    self.log.info( "Successfully removed all files found in the BK" )
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks( self, transID ):
    ''' clean tasks from WMS
    '''
    res = self.__getTransformationExternalIDs( transID )
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters( transID, ['Type'] )
      if not res['OK']:
        self.log.error( "Failed to determine transformation type" )
        return res
      transType = res['Value']
      if transType in [ 'Replication', 'Removal' ]:
        res = self.__removeRequests( externalIDs )
      else:
        res = self.__removeWMSTasks( externalIDs )
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs( self, transID ):
    ''' collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transforamtion ID
    '''
    res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } )
    if not res['OK']:
      self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] )
      return res
    externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ]
    self.log.info( "Found %d tasks for transformation" % len( externalIDs ) )
    return S_OK( externalIDs )

  def __removeRequests( self, requestIDs ):
    ''' dummy method '''
    self.log.error( "Not removing requests but should do" )
    return S_OK()

  def __removeWMSTasks( self, transJobIDs ):
    ''' wipe out jobs and their requests from the system

    TODO: should check request status, maybe FTS files as well ???

    :param self: self reference
    :param list trasnJobIDs: job IDs
    '''
    # Prevent 0 job IDs
    jobIDs = [ int( j ) for j in transJobIDs if int( j ) ]
    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):

      res = self.wmsClient.killJob( jobList )
      if res['OK']:
        self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

      res = self.wmsClient.deleteJob( jobList )
      if res['OK']:
        self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

    if not allRemove:
      return S_ERROR( "Failed to remove all remnants from WMS" )
    self.log.info( "Successfully removed all tasks from the WMS" )

    if not jobIDs:
      self.log.info( "JobIDs not present, unable to remove asociated requests." )
      return S_OK()

    res = self.requestClient.getRequestForJobs( jobIDs )
    if not res['OK']:
      self.log.error( "Failed to get requestID for jobs.", res['Message'] )
      return res
    failoverRequests = res['Value']
    self.log.info( "Found %d jobs with associated failover requests" % len( failoverRequests ) )
    if not failoverRequests:
      return S_OK()
    failed = 0
    for jobID, requestName in failoverRequests.items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.requestClient.deleteRequest( requestName )
      if not res['OK']:
        self.log.error( "Failed to remove request from RequestDB", res['Message'] )
        failed += 1
      else:
        self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )
    if failed:
      self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
      self.log.info( "Failed to remove %s requests" % failed )
      return S_ERROR( "Failed to remove all the request from RequestDB" )
    self.log.info( "Successfully removed all the associated failover requests" )
    return S_OK()
예제 #10
0
class TransformationCleaningAgent( AgentModule ):
  '''
  .. class:: TransformationCleaningAgent

  :param ReplicaManger replicaManager: ReplicaManager instance
  :param TransfromationClient transClient: TransfromationClient instance
  :param RequestClient requestClient: RequestClient instance
  :param FileCatalogClient metadataClient: FileCatalogClient instance

  '''

  def __init__( self, *args, **kwargs ):
    ''' c'tor
    '''
    AgentModule.__init__( self, *args, **kwargs )
    # # replica manager
    self.replicaManager = ReplicaManager()
    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.requestClient = RequestClient()
    # # file catalog clinet
    self.metadataClient = FileCatalogClient()

    # # placeholders for CS options

    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = None
    # # transformation metadata
    self.transfidmeta = None
    # # archive periof in days
    self.archiveAfter = None
    # # active SEs
    self.activeStorages = None
    # # transformation log SEs
    self.logSE = None
    # # enable/disable execution
    self.enableFlag = None

  def initialize( self ):
    ''' agent initialisation

    reading and setting confing opts

    :param self: self reference
    '''
    # # shifter proxy
    self.am_setOption( 'shifterProxy', 'DataManager' )
    # # transformations types
    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = sortList( agentTSTypes )
    else:
      dataProc = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
      dataManip = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] )
      self.transformationTypes = sortList( dataProc + dataManip )
    self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) )
    # # directory locations
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB',
                                                                                   'MetadataCatalog' ] ) )
    self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    # # transformation metadata
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    # # archive periof in days
    self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 )  # days
    self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter )
    # # active SEs
    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    # # transformation log SEs
    self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' )
    self.log.info( "Will remove logs found on storage element: %s" % self.logSE )
    # # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    return S_OK()

  #############################################################################
  def execute( self ):
    ''' execution in one agent's cycle

    :param self: self reference
    '''

    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' )
      return S_OK( 'Disabled via CS flag' )

    # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations( { 'Status' : 'Cleaning',
                                                 'Type' : self.transformationTypes } )
    if res['OK']:
      for transDict in res['Value']:
        # # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # # We just archive
        if transDict[ 'Type' ] in [ 'Replication', 'Removal' ]:
          res = self.archiveTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                         res['Message'] ) )
        else:
          res = self.cleanTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'],
                                                                        res['Message'] ) )


    # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
    res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles',
                                                 'Type' : self.transformationTypes} )
    if res['OK']:
      for transDict in res['Value']:
        res = self.removeTransformationOutput( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )

    # # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter )
    res = self.transClient.getTransformations( { 'Status' : 'Completed',
                                                 'Type' : self.transformationTypes },
                                                 older = olderThanTime,
                                                 timeStamp = 'LastUpdate' )
    if res['OK']:
      for transDict in res['Value']:
        res = self.archiveTransformation( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )
    else:
      self.log.error( "Could not get the transformations" )

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    ''' get the directories for the supplied transformation from the transformation system

    :param self: self reference
    :param int transID: transformation ID
    '''
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        self.log.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      transDirectories = res['Value'].splitlines()
      directories = self._addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} )
      if not res['OK']:
        self.log.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )

    if not directories:
      self.log.info( "No output directories found" )
    directories = sortList( directories )
    return S_OK( directories )

  @classmethod
  def _addDirs( self, transID, newDirs, existingDirs ):
    ''' append uniqe :newDirs: list to :existingDirs: list

    :param self: self reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    '''
    for folder in newDirs:
      transStr = str( transID ).zfill( 8 )
      if re.search( transStr, str( folder ) ):
        if not folder in existingDirs:
          existingDirs.append( folder )
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanStorageContents( self, directory ):
    ''' delete lfn dir from all active SE

    :param self: self reference
    :param sre directory: folder name
    '''
    for storageElement in self.activeStorages:
      res = self.__removeStorageDirectory( directory, storageElement )
      if not res['OK']:
        return res
    return S_OK()

  def __removeStorageDirectory( self, directory, storageElement ):
    ''' wipe out all contents from :directory: at :storageElement:

    :param self: self reference
    :param str directory: path
    :param str storageElement: SE name
    '''
    self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) )
    res = self.replicaManager.getPfnForLfn( [directory], storageElement )
    if not res['OK']:
      self.log.error( "Failed to get PFN for directory", res['Message'] )
      return res
    for directory, error in res['Value']['Failed'].items():
      self.log.error( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectory = res['Value']['Successful'].values()[0]
    res = self.replicaManager.getStorageFileExists( storageDirectory, storageElement, singleFile = True )
    if not res['OK']:
      self.log.error( "Failed to obtain existance of directory", res['Message'] )
      return res
    exists = res['Value']
    if not exists:
      self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) )
      return S_OK()
    res = self.replicaManager.removeStorageDirectory( storageDirectory,
                                                      storageElement,
                                                      recursive = True,
                                                      singleDirectory = True )
    if not res['OK']:
      self.log.error( "Failed to remove storage directory", res['Message'] )
      return res
    self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'],
                                                                     directory,
                                                                     storageElement ) )
    return S_OK()

  def cleanCatalogContents( self, directory ):
    ''' wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    '''
    res = self.__getCatalogDirectoryContents( [directory] )
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      return S_OK()
    self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) )
    res = self.replicaManager.removeFile( filesFound, force = True )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the catalog" )
    return S_OK()

  def __getCatalogDirectoryContents( self, directories ):
    ''' get catalog contents under paths :directories:

    :param self: self reference
    :param list directories: list of paths in catalog
    '''
    self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) )
    for directory in directories:
      self.log.info( directory )
    activeDirs = directories
    allFiles = {}
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = self.replicaManager.getCatalogListDirectory( currentDir, singleFile = True )
      activeDirs.remove( currentDir )
      if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ):
        self.log.info( "The supplied directory %s does not exist" % currentDir )
      elif not res['OK']:
        self.log.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Message'] ) )
      else:
        dirContents = res['Value']
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )
    self.log.info( "Found %d files" % len( allFiles ) )
    return S_OK( allFiles.keys() )

  def cleanTransformationLogFiles( self, directory ):
    ''' clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    '''
    self.log.info( "Removing log files found in the directory %s" % directory )
    res = self.replicaManager.removeStorageDirectory( directory, self.logSE, singleDirectory = True )
    if not res['OK']:
      self.log.error( "Failed to remove log files", res['Message'] )
      return res
    self.log.info( "Successfully removed transformation log directory" )
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput( self, transID ):
    ''' This just removes any mention of the output data from the catalog and storage '''
    self.log.info( "Removing output data for transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search( '/LOG/', directory ):
        res = self.cleanCatalogContents( directory )
        if not res['OK']:
          return res
        res = self.cleanStorageContents( directory )
        if not res['OK']:
          return res
    self.log.info( "Removed directories in the catalog and storage for transformation" )
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully removed output of transformation %d" % transID )
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) )
    return S_OK()

  def archiveTransformation( self, transID ):
    ''' This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    '''
    self.log.info( "Archiving transformation %s" % transID )
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully archived transformation %d" % transID )
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Archived" % ( transID ) )
    return S_OK()

  def cleanTransformation( self, transID ):
    ''' This removes any mention of the supplied transformation
    '''
    self.log.info( "Cleaning transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search( '/LOG/', directory ):
        res = self.cleanTransformationLogFiles( directory )
        if not res['OK']:
          return res
      res = self.cleanCatalogContents( directory )
      if not res['OK']:
        return res
      res = self.cleanStorageContents( directory )
      if not res['OK']:
        return res
    # Clean ALL the possible remnants found in the BK
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully cleaned transformation %d" % transID )
    # Change the status of the transformation to deleted
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Deleted' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Deleted" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Deleted" % ( transID ) )
    return S_OK()

  def cleanMetadataCatalogFiles( self, transID ):
    ''' wipe out files from catalog '''
    res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } )
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info( 'No files found for transID %s' % transID )
      return S_OK()
    res = self.replicaManager.removeFile( fileToRemove, force = True )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the metadata catalog" )
    self.log.info( "Successfully removed all files found in the BK" )
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks( self, transID ):
    ''' clean tasks from WMS
    '''
    res = self.__getTransformationExternalIDs( transID )
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters( transID, ['Type'] )
      if not res['OK']:
        self.log.error( "Failed to determine transformation type" )
        return res
      transType = res['Value']
      if transType in [ 'Replication', 'Removal' ]:
        res = self.__removeRequests( externalIDs )
      else:
        res = self.__removeWMSTasks( externalIDs )
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs( self, transID ):
    ''' collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transforamtion ID
    '''
    res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } )
    if not res['OK']:
      self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] )
      return res
    externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ]
    self.log.info( "Found %d tasks for transformation" % len( externalIDs ) )
    return S_OK( externalIDs )

  def __removeRequests( self, requestIDs ):
    ''' dummy method '''
    self.log.error( "Not removing requests but should do" )
    return S_OK()

  def __removeWMSTasks( self, transJobIDs ):
    ''' wipe out jobs and their requests from the system

    TODO: should check request status, maybe FTS files as well ???

    :param self: self reference
    :param list trasnJobIDs: job IDs
    '''
    # Prevent 0 job IDs
    jobIDs = [ int( j ) for j in transJobIDs if int( j ) ]
    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):

      res = self.wmsClient.killJob( jobList )
      if res['OK']:
        self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

      res = self.wmsClient.deleteJob( jobList )
      if res['OK']:
        self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

    if not allRemove:
      return S_ERROR( "Failed to remove all remnants from WMS" )
    self.log.info( "Successfully removed all tasks from the WMS" )

    if not jobIDs:
      self.log.info( "JobIDs not present, unable to remove asociated requests." )
      return S_OK()

    res = self.requestClient.getRequestForJobs( jobIDs )
    if not res['OK']:
      self.log.error( "Failed to get requestID for jobs.", res['Message'] )
      return res
    failoverRequests = res['Value']
    self.log.info( "Found %d jobs with associated failover requests" % len( failoverRequests ) )
    if not failoverRequests:
      return S_OK()
    failed = 0
    for jobID, requestName in failoverRequests.items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.requestClient.deleteRequest( requestName )
      if not res['OK']:
        self.log.error( "Failed to remove request from RequestDB", res['Message'] )
        failed += 1
      else:
        self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )
    if failed:
      self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
      self.log.info( "Failed to remove %s requests" % failed )
      return S_ERROR( "Failed to remove all the request from RequestDB" )
    self.log.info( "Successfully removed all the associated failover requests" )
    return S_OK()
예제 #11
0
class TransformationCleaningAgent(AgentModule):
  """
  .. class:: TransformationCleaningAgent

  :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
  :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
  :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance

  """

  def __init__(self, *args, **kwargs):
    """ c'tor
    """
    AgentModule.__init__(self, *args, **kwargs)

    self.shifterProxy = None

    # # transformation client
    self.transClient = None
    # # wms client
    self.wmsClient = None
    # # request client
    self.reqClient = None
    # # file catalog client
    self.metadataClient = None

    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = ['TransformationDB', 'MetadataCatalog']
    # # transformation metadata
    self.transfidmeta = 'TransformationID'
    # # archive periof in days
    self.archiveAfter = 7
    # # transformation log SEs
    self.logSE = 'LogSE'
    # # enable/disable execution
    self.enableFlag = 'True'

    self.dataProcTTypes = ['MCSimulation', 'Merge']
    self.dataManipTTypes = ['Replication', 'Removal']

  def initialize(self):
    """ agent initialisation

    reading and setting confing opts

    :param self: self reference
    """
    # # shifter proxy
    # See cleanContent method: this proxy will be used ALSO when the file catalog used
    # is the DIRAC File Catalog (DFC).
    # This is possible because of unset of the "UseServerCertificate" option
    self.shifterProxy = self.am_getOption('shifterProxy', self.shifterProxy)

    # # transformations types
    self.dataProcTTypes = Operations().getValue('Transformations/DataProcessing', self.dataProcTTypes)
    self.dataManipTTypes = Operations().getValue('Transformations/DataManipulation', self.dataManipTTypes)
    agentTSTypes = self.am_getOption('TransformationTypes', [])
    if agentTSTypes:
      self.transformationTypes = sorted(agentTSTypes)
    else:
      self.transformationTypes = sorted(self.dataProcTTypes + self.dataManipTTypes)
    self.log.info("Will consider the following transformation types: %s" % str(self.transformationTypes))
    # # directory locations
    self.directoryLocations = sorted(self.am_getOption('DirectoryLocations', self.directoryLocations))
    self.log.info("Will search for directories in the following locations: %s" % str(self.directoryLocations))
    # # transformation metadata
    self.transfidmeta = self.am_getOption('TransfIDMeta', self.transfidmeta)
    self.log.info("Will use %s as metadata tag name for TransformationID" % self.transfidmeta)
    # # archive periof in days
    self.archiveAfter = self.am_getOption('ArchiveAfter', self.archiveAfter)  # days
    self.log.info("Will archive Completed transformations after %d days" % self.archiveAfter)
    # # transformation log SEs
    self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE)
    self.log.info("Will remove logs found on storage element: %s" % self.logSE)

    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.reqClient = ReqClient()
    # # file catalog client
    self.metadataClient = FileCatalogClient()

    return S_OK()

  #############################################################################
  def execute(self):
    """ execution in one agent's cycle

    :param self: self reference
    """

    self.enableFlag = self.am_getOption('EnableFlag', self.enableFlag)
    if self.enableFlag != 'True':
      self.log.info('TransformationCleaningAgent is disabled by configuration option EnableFlag')
      return S_OK('Disabled via CS flag')

    # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations({'Status': 'Cleaning',
                                               'Type': self.transformationTypes})
    if res['OK']:
      for transDict in res['Value']:
        if self.shifterProxy:
          self._executeClean(transDict)
        else:
          self.log.info("Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" %
                        transDict)
          executeWithUserProxy(self._executeClean)(transDict,
                                                   proxyUserDN=transDict['AuthorDN'],
                                                   proxyUserGroup=transDict['AuthorGroup'])
    else:
      self.log.error("Failed to get transformations", res['Message'])

    # Obtain the transformations in RemovingFiles status and removes the output files
    res = self.transClient.getTransformations({'Status': 'RemovingFiles',
                                               'Type': self.transformationTypes})
    if res['OK']:
      for transDict in res['Value']:
        if self.shifterProxy:
          self._executeRemoval(transDict)
        else:
          self.log.info("Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" %
                        transDict)
          executeWithUserProxy(self._executeRemoval)(transDict,
                                                     proxyUserDN=transDict['AuthorDN'],
                                                     proxyUserGroup=transDict['AuthorGroup'])
    else:
      self.log.error("Could not get the transformations", res['Message'])

    # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
    res = self.transClient.getTransformations({'Status': 'Completed',
                                               'Type': self.transformationTypes},
                                              older=olderThanTime,
                                              timeStamp='LastUpdate')
    if res['OK']:
      for transDict in res['Value']:
        if self.shifterProxy:
          self._executeArchive(transDict)
        else:
          self.log.info("Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s" %
                        transDict)
          executeWithUserProxy(self._executeArchive)(transDict,
                                                     proxyUserDN=transDict['AuthorDN'],
                                                     proxyUserGroup=transDict['AuthorGroup'])
    else:
      self.log.error("Could not get the transformations", res['Message'])
    return S_OK()

  def _executeClean(self, transDict):
    """Clean transformation."""
    # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
    # We just archive
    if transDict['Type'] in self.dataManipTTypes:
      res = self.archiveTransformation(transDict['TransformationID'])
      if not res['OK']:
        self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'],
                                                                     res['Message']))
    else:
      res = self.cleanTransformation(transDict['TransformationID'])
      if not res['OK']:
        self.log.error("Problems cleaning transformation %s: %s" % (transDict['TransformationID'],
                                                                    res['Message']))

  def _executeRemoval(self, transDict):
    """Remove files from given transformation."""
    res = self.removeTransformationOutput(transDict['TransformationID'])
    if not res['OK']:
      self.log.error("Problems removing transformation %s: %s" % (transDict['TransformationID'],
                                                                  res['Message']))

  def _executeArchive(self, transDict):
    """Archive the given transformation."""
    res = self.archiveTransformation(transDict['TransformationID'])
    if not res['OK']:
      self.log.error("Problems archiving transformation %s: %s" % (transDict['TransformationID'],
                                                                   res['Message']))

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories(self, transID):
    """ get the directories for the supplied transformation from the transformation system.
        These directories are used by removeTransformationOutput and cleanTransformation for removing output.

    :param self: self reference
    :param int transID: transformation ID
    """
    self.log.verbose("Cleaning Transformation directories of transformation %d" % transID)
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters(transID, ['OutputDirectories'])
      if not res['OK']:
        self.log.error("Failed to obtain transformation directories", res['Message'])
        return res
      transDirectories = []
      if res['Value']:
        if not isinstance(res['Value'], list):
          try:
            transDirectories = ast.literal_eval(res['Value'])
          except BaseException:
            # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
            transDirectories.append(res['Value'])
        else:
          transDirectories = res['Value']
      directories = self._addDirs(transID, transDirectories, directories)

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata({self.transfidmeta: transID})
      if not res['OK']:
        self.log.error("Failed to obtain metadata catalog directories", res['Message'])
        return res
      transDirectories = res['Value']
      directories = self._addDirs(transID, transDirectories, directories)

    if not directories:
      self.log.info("No output directories found")
    directories = sorted(directories)
    return S_OK(directories)

  @classmethod
  def _addDirs(cls, transID, newDirs, existingDirs):
    """ append unique :newDirs: list to :existingDirs: list

    :param self: self reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    """
    for folder in newDirs:
      transStr = str(transID).zfill(8)
      if re.search(transStr, str(folder)):
        if folder not in existingDirs:
          existingDirs.append(os.path.normpath(folder))
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanContent(self, directory):
    """ wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    """
    self.log.verbose("Cleaning Catalog contents")
    res = self.__getCatalogDirectoryContents([directory])
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      self.log.info("No files are registered in the catalog directory %s" % directory)
      return S_OK()
    self.log.info("Attempting to remove %d possible remnants from the catalog and storage" % len(filesFound))

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
    res = DataManager().removeFile(filesFound, force=True)
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')

    if not res['OK']:
      return res
    realFailure = False
    for lfn, reason in res['Value']['Failed'].items():
      if "File does not exist" in str(reason):
        self.log.warn("File %s not found in some catalog: " % (lfn))
      else:
        self.log.error("Failed to remove file found in the catalog", "%s %s" % (lfn, reason))
        realFailure = True
    if realFailure:
      return S_ERROR("Failed to remove all files found in the catalog")
    return S_OK()

  def __getCatalogDirectoryContents(self, directories):
    """ get catalog contents under paths :directories:

    :param self: self reference
    :param list directories: list of paths in catalog
    """
    self.log.info('Obtaining the catalog contents for %d directories:' % len(directories))
    for directory in directories:
      self.log.info(directory)
    activeDirs = directories
    allFiles = {}
    fc = FileCatalog()
    while activeDirs:
      currentDir = activeDirs[0]
      res = returnSingleResult(fc.listDirectory(currentDir))
      activeDirs.remove(currentDir)
      if not res['OK'] and 'Directory does not exist' in res['Message']:  # FIXME: DFC should return errno
        self.log.info("The supplied directory %s does not exist" % currentDir)
      elif not res['OK']:
        if "No such file or directory" in res['Message']:
          self.log.info("%s: %s" % (currentDir, res['Message']))
        else:
          self.log.error("Failed to get directory %s content: %s" % (currentDir, res['Message']))
      else:
        dirContents = res['Value']
        activeDirs.extend(dirContents['SubDirs'])
        allFiles.update(dirContents['Files'])
    self.log.info("Found %d files" % len(allFiles))
    return S_OK(allFiles.keys())

  def cleanTransformationLogFiles(self, directory):
    """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    """
    self.log.verbose("Removing log files found in the directory %s" % directory)
    res = returnSingleResult(StorageElement(self.logSE).removeDirectory(directory, recursive=True))
    if not res['OK']:
      if cmpError(res, errno.ENOENT):  # No such file or directory
        self.log.warn("Transformation log directory does not exist", directory)
        return S_OK()
      self.log.error("Failed to remove log files", res['Message'])
      return res
    self.log.info("Successfully removed transformation log directory")
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput(self, transID):
    """ This just removes any mention of the output data from the catalog and storage """
    self.log.info("Removing output data for transformation %s" % transID)
    res = self.getTransformationDirectories(transID)
    if not res['OK']:
      self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search('/LOG/', directory):
        res = self.cleanContent(directory)
        if not res['OK']:
          return res

    self.log.info("Removed %d directories from the catalog \
      and its files from the storage for transformation %s" % (len(directories), transID))
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully removed output of transformation %d" % transID)
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter(transID, 'Status', 'RemovedFiles')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to RemovedFiles" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to RemovedFiles" % (transID))
    return S_OK()

  def archiveTransformation(self, transID):
    """ This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    """
    self.log.info("Archiving transformation %s" % transID)
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks(transID)
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully archived transformation %d" % transID)
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter(transID, 'Status', 'Archived')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to Archived" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to Archived" % (transID))
    return S_OK()

  def cleanTransformation(self, transID):
    """ This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
    """
    self.log.info("Cleaning transformation %s" % transID)
    res = self.getTransformationDirectories(transID)
    if not res['OK']:
      self.log.error('Problem obtaining directories for transformation %s with result "%s"' % (transID, res))
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks(transID)
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search('/LOG/', directory):
        res = self.cleanTransformationLogFiles(directory)
        if not res['OK']:
          return res
      res = self.cleanContent(directory)
      if not res['OK']:
        return res

    # Clean ALL the possible remnants found
    res = self.cleanMetadataCatalogFiles(transID)
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation(transID)
    if not res['OK']:
      return res
    self.log.info("Successfully cleaned transformation %d" % transID)
    res = self.transClient.setTransformationParameter(transID, 'Status', 'Cleaned')
    if not res['OK']:
      self.log.error("Failed to update status of transformation %s to Cleaned" % (transID), res['Message'])
      return res
    self.log.info("Updated status of transformation %s to Cleaned" % (transID))
    return S_OK()

  def cleanMetadataCatalogFiles(self, transID):
    """ wipe out files from catalog """
    res = self.metadataClient.findFilesByMetadata({self.transfidmeta: transID})
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info('No files found for transID %s' % transID)
      return S_OK()

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'false')
    res = DataManager().removeFile(fileToRemove, force=True)
    gConfigurationData.setOptionInCFG('/DIRAC/Security/UseServerCertificate', 'true')

    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error("Failed to remove file found in metadata catalog", "%s %s" % (lfn, reason))
    if res['Value']['Failed']:
      return S_ERROR("Failed to remove all files found in the metadata catalog")
    self.log.info("Successfully removed all files found in the BK")
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks(self, transID):
    """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation
    """
    self.log.verbose("Cleaning Transformation tasks of transformation %d" % transID)
    res = self.__getTransformationExternalIDs(transID)
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters(transID, ['Type'])
      if not res['OK']:
        self.log.error("Failed to determine transformation type")
        return res
      transType = res['Value']
      if transType in self.dataProcTTypes:
        res = self.__removeWMSTasks(externalIDs)
      else:
        res = self.__removeRequests(externalIDs)
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs(self, transID):
    """ collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transforamtion ID
    """
    res = self.transClient.getTransformationTasks(condDict={'TransformationID': transID})
    if not res['OK']:
      self.log.error("Failed to get externalIDs for transformation %d" % transID, res['Message'])
      return res
    externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]]
    self.log.info("Found %d tasks for transformation" % len(externalIDs))
    return S_OK(externalIDs)

  def __removeRequests(self, requestIDs):
    """ This will remove requests from the RMS system -
    """
    rIDs = [int(long(j)) for j in requestIDs if long(j)]
    for reqID in rIDs:
      self.reqClient.cancelRequest(reqID)

    return S_OK()

  def __removeWMSTasks(self, transJobIDs):
    """ wipe out jobs and their requests from the system

    :param self: self reference
    :param list trasnJobIDs: job IDs
    """
    # Prevent 0 job IDs
    jobIDs = [int(j) for j in transJobIDs if int(j)]
    allRemove = True
    for jobList in breakListIntoChunks(jobIDs, 500):

      res = self.wmsClient.killJob(jobList)
      if res['OK']:
        self.log.info("Successfully killed %d jobs from WMS" % len(jobList))
      elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
        self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
      elif "NonauthorizedJobIDs" in res:
        self.log.error("Failed to kill %s jobs because not authorized" % len(res['NonauthorizedJobIDs']))
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error("Failed to kill %s jobs" % len(res['FailedJobIDs']))
        allRemove = False

      res = self.wmsClient.deleteJob(jobList)
      if res['OK']:
        self.log.info("Successfully removed %d jobs from WMS" % len(jobList))
      elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
        self.log.info("Found %s jobs which did not exist in the WMS" % len(res['InvalidJobIDs']))
      elif "NonauthorizedJobIDs" in res:
        self.log.error("Failed to remove %s jobs because not authorized" % len(res['NonauthorizedJobIDs']))
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error("Failed to remove %s jobs" % len(res['FailedJobIDs']))
        allRemove = False

    if not allRemove:
      return S_ERROR("Failed to remove all remnants from WMS")
    self.log.info("Successfully removed all tasks from the WMS")

    if not jobIDs:
      self.log.info("JobIDs not present, unable to remove asociated requests.")
      return S_OK()

    failed = 0
    failoverRequests = {}
    res = self.reqClient.getRequestIDsForJobs(jobIDs)
    if not res['OK']:
      self.log.error("Failed to get requestID for jobs.", res['Message'])
      return res
    failoverRequests.update(res['Value']['Successful'])
    if not failoverRequests:
      return S_OK()
    for jobID, requestID in res['Value']['Successful'].items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.reqClient.cancelRequest(requestID)
      if not res['OK']:
        self.log.error("Failed to remove request from RequestDB", res['Message'])
        failed += 1
      else:
        self.log.verbose("Removed request %s associated to job %d." % (requestID, jobID))

    if failed:
      self.log.info("Successfully removed %s requests" % (len(failoverRequests) - failed))
      self.log.info("Failed to remove %s requests" % failed)
      return S_ERROR("Failed to remove all the request from RequestDB")
    self.log.info("Successfully removed all the associated failover requests")
    return S_OK()
예제 #12
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """# Now, let's submit some jobs. Different sites, types, inputs"""
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [["/a/1.txt", "/a/2.txt"],
                 ["/a/1.txt", "/a/3.txt", "/a/4.txt"], []]
        types = ["User", "Test"]
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination("DIRAC.Jenkins.ch")
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res["OK"], res.get("Message"))
                jobID = res["Value"]
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            set(res["Value"]) <= {"ANY", "DIRAC.Jenkins.ch", "Site"},
            msg="Got %s" % res["Value"])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(sorted(res["Value"]),
                         sorted(types),
                         msg="Got %s" % str(sorted(res["Value"])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], ["app status", "Unknown"],
                         msg="Got %s" % str(res["Value"]))

        res = jobMonitor.getOwners()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_empty = res["Value"]
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_olderThanNow = res["Value"]
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res["OK"], res.get("Message"))
        resJG_olderThanOneYear = res["Value"]
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)),
            resJG_olderThanOneYear)
        res = jobMonitor.getStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            sorted(res["Value"])
            in [[JobStatus.RECEIVED],
                sorted([JobStatus.RECEIVED, JobStatus.KILLED])], res["Value"])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            sorted(res["Value"]) in [
                ["Job accepted"],
                sorted(["Job accepted", "Job Rescheduled"]),
                sorted(["Job accepted", "Marked for termination"]),
            ],
            res["Value"],
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobs()
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(
            set([str(x) for x in jobIDs]) <= set(res["Value"]), res["Value"])
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobStateUpdate.setJobStatusBulk(
            jobID,
            {
                str(datetime.datetime.utcnow()): {
                    "Status": JobStatus.CHECKING,
                    "MinorStatus": "MinorStatus",
                    "Source": "Unknown",
                }
            },
            False,
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.CHECKING)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus")

        res = jobStateUpdate.setJobStatusBulk(
            jobID,
            {
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=1)):
                {
                    "Status": JobStatus.WAITING,
                    "MinorStatus": "MinorStatus",
                    "Source": "Unknown",
                },
                str(datetime.datetime.utcnow() + datetime.timedelta(hours=2)):
                {
                    "Status": JobStatus.MATCHED,
                    "MinorStatus": "MinorStatus-matched",
                    "Source": "Unknown",
                },
            },
            False,
        )
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

        res = jobStateUpdate.setJobsParameter({jobID: ["Whatever", "booh"]})
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.MATCHED)
        self.assertEqual(res["Value"]["MinorStatus"], "MinorStatus-matched")

        res = jobStateUpdate.setJobAttribute(jobID, "Status",
                                             JobStatus.RUNNING)
        self.assertTrue(res["OK"], res.get("Message"))

        res = jobMonitor.getJobSummary(int(jobID))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"], JobStatus.RUNNING)

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
예제 #13
0
  def test_FullChain( self ):
    """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    # create the job
    job = helloWorldJob()
    jobDescription = createFile( job )

    # submit the job
    res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
    self.assert_( res['OK'] )
  # self.assertEqual( type( res['Value'] ), int )
  # self.assertEqual( res['Value'], res['JobID'] )
  # jobID = res['JobID']
    jobID = res['Value']

    # updating the status
    jobStateUpdate.setJobStatus( jobID, 'Running', 'Executing Minchiapp', 'source' )

    # reset the job
    res = wmsClient.resetJob( jobID )
    self.assert_( res['OK'] )

    # reschedule the job
    res = wmsClient.rescheduleJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Received' )

    # updating the status again
    jobStateUpdate.setJobStatus( jobID, 'Matched', 'matching', 'source' )

    # kill the job
    res = wmsClient.killJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Killed' )

    # updating the status aaaagain
    jobStateUpdate.setJobStatus( jobID, 'Done', 'matching', 'source' )

    # kill the job
    res = wmsClient.killJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Done' )  # this time it won't kill... it's done!

    # delete the job - this will just set its status to "deleted"
    res = wmsClient.deleteJob( jobID )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobStatus( jobID )
    self.assert_( res['OK'] )
    self.assertEqual( res['Value'], 'Deleted' )
예제 #14
0
class TransformationCleaningAgent(AgentModule):
    """
  .. class:: TransformationCleaningAgent

  :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
  :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
  :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance

  """
    def __init__(self, *args, **kwargs):
        """ c'tor
    """
        AgentModule.__init__(self, *args, **kwargs)

        self.shifterProxy = None

        # # transformation client
        self.transClient = None
        # # wms client
        self.wmsClient = None
        # # request client
        self.reqClient = None
        # # file catalog client
        self.metadataClient = None

        # # transformations types
        self.transformationTypes = None
        # # directory locations
        self.directoryLocations = ['TransformationDB', 'MetadataCatalog']
        # # transformation metadata
        self.transfidmeta = 'TransformationID'
        # # archive periof in days
        self.archiveAfter = 7
        # # transformation log SEs
        self.logSE = 'LogSE'
        # # enable/disable execution
        self.enableFlag = 'True'

        self.dataProcTTypes = ['MCSimulation', 'Merge']
        self.dataManipTTypes = ['Replication', 'Removal']

    def initialize(self):
        """ agent initialisation

    reading and setting confing opts

    :param self: self reference
    """
        # # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption('shifterProxy',
                                              self.shifterProxy)

        # # transformations types
        self.dataProcTTypes = Operations().getValue(
            'Transformations/DataProcessing', self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            'Transformations/DataManipulation', self.dataManipTTypes)
        agentTSTypes = self.am_getOption('TransformationTypes', [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes +
                                              self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(
            self.am_getOption('DirectoryLocations', self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption('TransfIDMeta',
                                              self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)
        # # archive periof in days
        self.archiveAfter = self.am_getOption('ArchiveAfter',
                                              self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)
        # # transformation log SEs
        self.logSE = Operations().getValue('/LogStorage/LogSE', self.logSE)
        self.log.info("Will remove logs found on storage element: %s" %
                      self.logSE)

        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """ execution in one agent's cycle

    :param self: self reference
    """

        self.enableFlag = self.am_getOption('EnableFlag', self.enableFlag)
        if self.enableFlag != 'True':
            self.log.info(
                'TransformationCleaningAgent is disabled by configuration option EnableFlag'
            )
            return S_OK('Disabled via CS flag')

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({
            'Status':
            'Cleaning',
            'Type':
            self.transformationTypes
        })
        if res['OK']:
            for transDict in res['Value']:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict['AuthorDN'],
                        proxyUserGroup=transDict['AuthorGroup'])
        else:
            self.log.error("Failed to get transformations", res['Message'])

        # Obtain the transformations in RemovingFiles status and removes the output files
        res = self.transClient.getTransformations({
            'Status':
            'RemovingFiles',
            'Type':
            self.transformationTypes
        })
        if res['OK']:
            for transDict in res['Value']:
                if self.shifterProxy:
                    self._executeRemoval(transDict)
                else:
                    self.log.info(
                        "Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeRemoval)(
                        transDict,
                        proxyUserDN=transDict['AuthorDN'],
                        proxyUserGroup=transDict['AuthorGroup'])
        else:
            self.log.error("Could not get the transformations", res['Message'])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {
                'Status': 'Completed',
                'Type': self.transformationTypes
            },
            older=olderThanTime,
            timeStamp='LastUpdate')
        if res['OK']:
            for transDict in res['Value']:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict['AuthorDN'],
                        proxyUserGroup=transDict['AuthorGroup'])
        else:
            self.log.error("Could not get the transformations", res['Message'])
        return S_OK()

    def _executeClean(self, transDict):
        """Clean transformation."""
        # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # We just archive
        if transDict['Type'] in self.dataManipTTypes:
            res = self.archiveTransformation(transDict['TransformationID'])
            if not res['OK']:
                self.log.error("Problems archiving transformation %s: %s" %
                               (transDict['TransformationID'], res['Message']))
        else:
            res = self.cleanTransformation(transDict['TransformationID'])
            if not res['OK']:
                self.log.error("Problems cleaning transformation %s: %s" %
                               (transDict['TransformationID'], res['Message']))

    def _executeRemoval(self, transDict):
        """Remove files from given transformation."""
        res = self.removeTransformationOutput(transDict['TransformationID'])
        if not res['OK']:
            self.log.error("Problems removing transformation %s: %s" %
                           (transDict['TransformationID'], res['Message']))

    def _executeArchive(self, transDict):
        """Archive the given transformation."""
        res = self.archiveTransformation(transDict['TransformationID'])
        if not res['OK']:
            self.log.error("Problems archiving transformation %s: %s" %
                           (transDict['TransformationID'], res['Message']))

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """ get the directories for the supplied transformation from the transformation system.
        These directories are used by removeTransformationOutput and cleanTransformation for removing output.

    :param self: self reference
    :param int transID: transformation ID
    """
        self.log.verbose(
            "Cleaning Transformation directories of transformation %d" %
            transID)
        directories = []
        if 'TransformationDB' in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ['OutputDirectories'])
            if not res['OK']:
                self.log.error("Failed to obtain transformation directories",
                               res['Message'])
                return res
            transDirectories = []
            if res['Value']:
                if not isinstance(res['Value'], list):
                    try:
                        transDirectories = ast.literal_eval(res['Value'])
                    except BaseException:
                        # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
                        transDirectories.append(res['Value'])
                else:
                    transDirectories = res['Value']
            directories = self._addDirs(transID, transDirectories, directories)

        if 'MetadataCatalog' in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res['OK']:
                self.log.error("Failed to obtain metadata catalog directories",
                               res['Message'])
                return res
            transDirectories = res['Value']
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """ append unique :newDirs: list to :existingDirs: list

    :param self: self reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    """
        for folder in newDirs:
            transStr = str(transID).zfill(8)
            if re.search(transStr, str(folder)):
                if folder not in existingDirs:
                    existingDirs.append(os.path.normpath(folder))
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanContent(self, directory):
        """ wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    """
        self.log.verbose("Cleaning Catalog contents")
        res = self.__getCatalogDirectoryContents([directory])
        if not res['OK']:
            return res
        filesFound = res['Value']
        if not filesFound:
            self.log.info(
                "No files are registered in the catalog directory %s" %
                directory)
            return S_OK()
        self.log.info(
            "Attempting to remove %d possible remnants from the catalog and storage"
            % len(filesFound))

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            '/DIRAC/Security/UseServerCertificate', 'false')
        res = DataManager().removeFile(filesFound, force=True)
        gConfigurationData.setOptionInCFG(
            '/DIRAC/Security/UseServerCertificate', 'true')

        if not res['OK']:
            return res
        realFailure = False
        for lfn, reason in res['Value']['Failed'].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog",
                               "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """ get catalog contents under paths :directories:

    :param self: self reference
    :param list directories: list of paths in catalog
    """
        self.log.info('Obtaining the catalog contents for %d directories:' %
                      len(directories))
        for directory in directories:
            self.log.info(directory)
        activeDirs = directories
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs[0]
            res = returnSingleResult(fc.listDirectory(currentDir))
            activeDirs.remove(currentDir)
            if not res['OK'] and 'Directory does not exist' in res[
                    'Message']:  # FIXME: DFC should return errno
                self.log.info("The supplied directory %s does not exist" %
                              currentDir)
            elif not res['OK']:
                if "No such file or directory" in res['Message']:
                    self.log.info("%s: %s" % (currentDir, res['Message']))
                else:
                    self.log.error("Failed to get directory %s content: %s" %
                                   (currentDir, res['Message']))
            else:
                dirContents = res['Value']
                activeDirs.extend(dirContents['SubDirs'])
                allFiles.update(dirContents['Files'])
        self.log.info("Found %d files" % len(allFiles))
        return S_OK(allFiles.keys())

    def cleanTransformationLogFiles(self, directory):
        """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    """
        self.log.verbose("Removing log files found in the directory %s" %
                         directory)
        res = returnSingleResult(
            StorageElement(self.logSE).removeDirectory(directory,
                                                       recursive=True))
        if not res['OK']:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist",
                              directory)
                return S_OK()
            self.log.error("Failed to remove log files", res['Message'])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """ This just removes any mention of the output data from the catalog and storage """
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            self.log.error(
                'Problem obtaining directories for transformation %s with result "%s"'
                % (transID, res))
            return S_OK()
        directories = res['Value']
        for directory in directories:
            if not re.search('/LOG/', directory):
                res = self.cleanContent(directory)
                if not res['OK']:
                    return res

        self.log.info("Removed %d directories from the catalog \
      and its files from the storage for transformation %s" %
                      (len(directories), transID))
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res['OK']:
            return res
        self.log.info("Successfully removed output of transformation %d" %
                      transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(
            transID, 'Status', 'RemovedFiles')
        if not res['OK']:
            self.log.error(
                "Failed to update status of transformation %s to RemovedFiles"
                % (transID), res['Message'])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" %
                      (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """ This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        self.log.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(
            transID, 'Status', 'Archived')
        if not res['OK']:
            self.log.error(
                "Failed to update status of transformation %s to Archived" %
                (transID), res['Message'])
            return res
        self.log.info("Updated status of transformation %s to Archived" %
                      (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """ This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
    """
        self.log.info("Cleaning transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            self.log.error(
                'Problem obtaining directories for transformation %s with result "%s"'
                % (transID, res))
            return S_OK()
        directories = res['Value']
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the log files for the jobs
        for directory in directories:
            if re.search('/LOG/', directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res['OK']:
                    return res
            res = self.cleanContent(directory)
            if not res['OK']:
                return res

        # Clean ALL the possible remnants found
        res = self.cleanMetadataCatalogFiles(transID)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        self.log.info("Successfully cleaned transformation %d" % transID)
        res = self.transClient.setTransformationParameter(
            transID, 'Status', 'Cleaned')
        if not res['OK']:
            self.log.error(
                "Failed to update status of transformation %s to Cleaned" %
                (transID), res['Message'])
            return res
        self.log.info("Updated status of transformation %s to Cleaned" %
                      (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """ wipe out files from catalog """
        res = self.metadataClient.findFilesByMetadata(
            {self.transfidmeta: transID})
        if not res['OK']:
            return res
        fileToRemove = res['Value']
        if not fileToRemove:
            self.log.info('No files found for transID %s' % transID)
            return S_OK()

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            '/DIRAC/Security/UseServerCertificate', 'false')
        res = DataManager().removeFile(fileToRemove, force=True)
        gConfigurationData.setOptionInCFG(
            '/DIRAC/Security/UseServerCertificate', 'true')

        if not res['OK']:
            return res
        for lfn, reason in res['Value']['Failed'].items():
            self.log.error("Failed to remove file found in metadata catalog",
                           "%s %s" % (lfn, reason))
        if res['Value']['Failed']:
            return S_ERROR(
                "Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the BK")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation
    """
        self.log.verbose("Cleaning Transformation tasks of transformation %d" %
                         transID)
        res = self.__getTransformationExternalIDs(transID)
        if not res['OK']:
            return res
        externalIDs = res['Value']
        if externalIDs:
            res = self.transClient.getTransformationParameters(
                transID, ['Type'])
            if not res['OK']:
                self.log.error("Failed to determine transformation type")
                return res
            transType = res['Value']
            if transType in self.dataProcTTypes:
                res = self.__removeWMSTasks(externalIDs)
            else:
                res = self.__removeRequests(externalIDs)
            if not res['OK']:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """ collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transforamtion ID
    """
        res = self.transClient.getTransformationTasks(
            condDict={'TransformationID': transID})
        if not res['OK']:
            self.log.error(
                "Failed to get externalIDs for transformation %d" % transID,
                res['Message'])
            return res
        externalIDs = [taskDict['ExternalID'] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """ This will remove requests from the RMS system -
    """
        rIDs = [int(long(j)) for j in requestIDs if long(j)]
        for reqID in rIDs:
            self.reqClient.cancelRequest(reqID)

        return S_OK()

    def __removeWMSTasks(self, transJobIDs):
        """ wipe out jobs and their requests from the system

    :param self: self reference
    :param list trasnJobIDs: job IDs
    """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):

            res = self.wmsClient.killJob(jobList)
            if res['OK']:
                self.log.info("Successfully killed %d jobs from WMS" %
                              len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found %s jobs which did not exist in the WMS" %
                              len(res['InvalidJobIDs']))
            elif "NonauthorizedJobIDs" in res:
                self.log.error(
                    "Failed to kill %s jobs because not authorized" %
                    len(res['NonauthorizedJobIDs']))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to kill %s jobs" %
                               len(res['FailedJobIDs']))
                allRemove = False

            res = self.wmsClient.deleteJob(jobList)
            if res['OK']:
                self.log.info("Successfully removed %d jobs from WMS" %
                              len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found %s jobs which did not exist in the WMS" %
                              len(res['InvalidJobIDs']))
            elif "NonauthorizedJobIDs" in res:
                self.log.error(
                    "Failed to remove %s jobs because not authorized" %
                    len(res['NonauthorizedJobIDs']))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to remove %s jobs" %
                               len(res['FailedJobIDs']))
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to remove all remnants from WMS")
        self.log.info("Successfully removed all tasks from the WMS")

        if not jobIDs:
            self.log.info(
                "JobIDs not present, unable to remove asociated requests.")
            return S_OK()

        failed = 0
        failoverRequests = {}
        res = self.reqClient.getRequestIDsForJobs(jobIDs)
        if not res['OK']:
            self.log.error("Failed to get requestID for jobs.", res['Message'])
            return res
        failoverRequests.update(res['Value']['Successful'])
        if not failoverRequests:
            return S_OK()
        for jobID, requestID in res['Value']['Successful'].items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == '0':
                continue
            res = self.reqClient.cancelRequest(requestID)
            if not res['OK']:
                self.log.error("Failed to remove request from RequestDB",
                               res['Message'])
                failed += 1
            else:
                self.log.verbose("Removed request %s associated to job %d." %
                                 (requestID, jobID))

        if failed:
            self.log.info("Successfully removed %s requests" %
                          (len(failoverRequests) - failed))
            self.log.info("Failed to remove %s requests" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info(
            "Successfully removed all the associated failover requests")
        return S_OK()
예제 #15
0
    def test_FullChain(self):
        """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(isinstance(res['Value'], int),
                        msg="Got %s" % type(res['Value']))
        self.assertEqual(res['Value'],
                         res['JobID'],
                         msg="Got %s, expected %s" %
                         (str(res['Value']), res['JobID']))
        jobID = res['JobID']
        jobID = res['Value']

        # updating the status
        res = jobStateUpdate.setJobStatus(jobID, 'Running',
                                          'Executing Minchiapp', 'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Received',
                         msg="Got %s" % str(res['Value']))
        res = jobMonitor.getJobsMinorStatus([jobID])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'],
            {jobID: {
                'MinorStatus': 'Job Rescheduled',
                'JobID': jobID
            }},
            msg="Got %s" % str(res['Value']))
        res = jobMonitor.getJobsApplicationStatus([jobID])
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'],
            {jobID: {
                'ApplicationStatus': 'Unknown',
                'JobID': jobID
            }},
            msg="Got %s" % str(res['Value']))

        # updating the status again
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Killed',
                         msg="Got %s" % str(res['Value']))

        # updating the status aaaagain
        res = jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')
        self.assertTrue(res['OK'], res.get('Message'))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(
            res['Value'], 'Done', msg="Got %s" %
            str(res['Value']))  # this time it won't kill... it's done!

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobStatus(jobID)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(res['Value'],
                         'Deleted',
                         msg="Got %s" % str(res['Value']))
예제 #16
0
class TransformationCleaningAgent(AgentModule):
    """
    .. class:: TransformationCleaningAgent

    :param ~DIRAC.DataManagementSystem.Client.DataManager.DataManager dm: DataManager instance
    :param ~TransformationClient.TransformationClient transClient: TransformationClient instance
    :param ~FileCatalogClient.FileCatalogClient metadataClient: FileCatalogClient instance

    """
    def __init__(self, *args, **kwargs):
        """c'tor"""
        AgentModule.__init__(self, *args, **kwargs)

        self.shifterProxy = None

        # # transformation client
        self.transClient = None
        # # wms client
        self.wmsClient = None
        # # request client
        self.reqClient = None
        # # file catalog client
        self.metadataClient = None

        # # transformations types
        self.transformationTypes = None
        # # directory locations
        self.directoryLocations = ["TransformationDB", "MetadataCatalog"]
        # # transformation metadata
        self.transfidmeta = "TransformationID"
        # # archive periof in days
        self.archiveAfter = 7
        # # transformation log SEs
        self.logSE = "LogSE"
        # # enable/disable execution
        self.enableFlag = "True"

        self.dataProcTTypes = ["MCSimulation", "Merge"]
        self.dataManipTTypes = ["Replication", "Removal"]

    def initialize(self):
        """agent initialisation

        reading and setting config opts

        :param self: self reference
        """
        # # shifter proxy
        # See cleanContent method: this proxy will be used ALSO when the file catalog used
        # is the DIRAC File Catalog (DFC).
        # This is possible because of unset of the "UseServerCertificate" option
        self.shifterProxy = self.am_getOption("shifterProxy",
                                              self.shifterProxy)

        # # transformations types
        self.dataProcTTypes = Operations().getValue(
            "Transformations/DataProcessing", self.dataProcTTypes)
        self.dataManipTTypes = Operations().getValue(
            "Transformations/DataManipulation", self.dataManipTTypes)
        agentTSTypes = self.am_getOption("TransformationTypes", [])
        if agentTSTypes:
            self.transformationTypes = sorted(agentTSTypes)
        else:
            self.transformationTypes = sorted(self.dataProcTTypes +
                                              self.dataManipTTypes)
        self.log.info("Will consider the following transformation types: %s" %
                      str(self.transformationTypes))
        # # directory locations
        self.directoryLocations = sorted(
            self.am_getOption("DirectoryLocations", self.directoryLocations))
        self.log.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        # # transformation metadata
        self.transfidmeta = self.am_getOption("TransfIDMeta",
                                              self.transfidmeta)
        self.log.info("Will use %s as metadata tag name for TransformationID" %
                      self.transfidmeta)
        # # archive periof in days
        self.archiveAfter = self.am_getOption("ArchiveAfter",
                                              self.archiveAfter)  # days
        self.log.info("Will archive Completed transformations after %d days" %
                      self.archiveAfter)
        # # transformation log SEs
        self.logSE = Operations().getValue("/LogStorage/LogSE", self.logSE)
        self.log.info("Will remove logs found on storage element: %s" %
                      self.logSE)

        # # transformation client
        self.transClient = TransformationClient()
        # # wms client
        self.wmsClient = WMSClient()
        # # request client
        self.reqClient = ReqClient()
        # # file catalog client
        self.metadataClient = FileCatalogClient()
        # # job monitoring client
        self.jobMonitoringClient = JobMonitoringClient()

        return S_OK()

    #############################################################################
    def execute(self):
        """execution in one agent's cycle

        :param self: self reference
        """

        self.enableFlag = self.am_getOption("EnableFlag", self.enableFlag)
        if self.enableFlag != "True":
            self.log.info(
                "TransformationCleaningAgent is disabled by configuration option EnableFlag"
            )
            return S_OK("Disabled via CS flag")

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({
            "Status":
            "Cleaning",
            "Type":
            self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Failed to get transformations", res["Message"])

        # Obtain the transformations in RemovingFiles status and removes the output files
        res = self.transClient.getTransformations({
            "Status":
            "RemovingFiles",
            "Type":
            self.transformationTypes
        })
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeRemoval(transDict)
                else:
                    self.log.info(
                        "Removing files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeRemoval)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Could not get the transformations", res["Message"])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {
                "Status": "Completed",
                "Type": self.transformationTypes
            },
            older=olderThanTime,
            timeStamp="LastUpdate")
        if res["OK"]:
            for transDict in res["Value"]:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])
        else:
            self.log.error("Could not get the transformations", res["Message"])
        return S_OK()

    def finalize(self):
        """Only at finalization: will clean ancient transformations (remnants)

            1) get the transformation IDs of jobs that are older than 1 year
            2) find the status of those transformations. Those "Cleaned" and "Archived" will be
               cleaned and archived (again)

        Why doing this here? Basically, it's a race:

        1) the production manager submits a transformation
        2) the TransformationAgent, and a bit later the WorkflowTaskAgent, put such transformation in their internal queue,
           so eventually during their (long-ish) cycle they'll work on it.
        3) 1 minute after creating the transformation, the production manager cleans it (by hand, for whatever reason).
           So, the status is changed to "Cleaning"
        4) the TransformationCleaningAgent cleans what has been created (maybe, nothing),
           then sets the transformation status to "Cleaned" or "Archived"
        5) a bit later the TransformationAgent, and later the WorkflowTaskAgent, kick in,
           creating tasks and jobs for a production that's effectively cleaned (but these 2 agents don't know yet).

        Of course, one could make one final check in TransformationAgent or WorkflowTaskAgent,
        but these 2 agents are already doing a lot of stuff, and are pretty heavy.
        So, we should just clean from time to time.
        What I added here is done only when the agent finalize, and it's quite light-ish operation anyway.
        """
        res = self.jobMonitoringClient.getJobGroups(
            None,
            datetime.utcnow() - timedelta(days=365))
        if not res["OK"]:
            self.log.error("Failed to get job groups", res["Message"])
            return res
        transformationIDs = res["Value"]
        if transformationIDs:
            res = self.transClient.getTransformations(
                {"TransformationID": transformationIDs})
            if not res["OK"]:
                self.log.error("Failed to get transformations", res["Message"])
                return res
            transformations = res["Value"]
            toClean = []
            toArchive = []
            for transDict in transformations:
                if transDict["Status"] == "Cleaned":
                    toClean.append(transDict)
                if transDict["Status"] == "Archived":
                    toArchive.append(transDict)

            for transDict in toClean:
                if self.shifterProxy:
                    self._executeClean(transDict)
                else:
                    self.log.info(
                        "Cleaning transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeClean)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])

            for transDict in toArchive:
                if self.shifterProxy:
                    self._executeArchive(transDict)
                else:
                    self.log.info(
                        "Archiving files for transformation %(TransformationID)s with %(AuthorDN)s, %(AuthorGroup)s"
                        % transDict)
                    executeWithUserProxy(self._executeArchive)(
                        transDict,
                        proxyUserDN=transDict["AuthorDN"],
                        proxyUserGroup=transDict["AuthorGroup"])

            # Remove JobIDs that were unknown to the TransformationSystem
            jobGroupsToCheck = [
                str(transDict["TransformationID"]).zfill(8)
                for transDict in toClean + toArchive
            ]
            res = self.jobMonitoringClient.getJobs(
                {"JobGroup": jobGroupsToCheck})
            if not res["OK"]:
                return res
            jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
            res = self.__removeWMSTasks(jobIDsToRemove)
            if not res["OK"]:
                return res

        return S_OK()

    def _executeClean(self, transDict):
        """Clean transformation."""
        # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # We just archive
        if transDict["Type"] in self.dataManipTTypes:
            res = self.archiveTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems archiving transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))
        else:
            res = self.cleanTransformation(transDict["TransformationID"])
            if not res["OK"]:
                self.log.error(
                    "Problems cleaning transformation",
                    "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeRemoval(self, transDict):
        """Remove files from given transformation."""
        res = self.removeTransformationOutput(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems removing transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

    def _executeArchive(self, transDict):
        """Archive the given transformation."""
        res = self.archiveTransformation(transDict["TransformationID"])
        if not res["OK"]:
            self.log.error(
                "Problems archiving transformation",
                "%s: %s" % (transDict["TransformationID"], res["Message"]))

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """get the directories for the supplied transformation from the transformation system.
            These directories are used by removeTransformationOutput and cleanTransformation for removing output.

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.verbose(
            "Cleaning Transformation directories of transformation %d" %
            transID)
        directories = []
        if "TransformationDB" in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ["OutputDirectories"])
            if not res["OK"]:
                self.log.error("Failed to obtain transformation directories",
                               res["Message"])
                return res
            transDirectories = []
            if res["Value"]:
                if not isinstance(res["Value"], list):
                    try:
                        transDirectories = ast.literal_eval(res["Value"])
                    except Exception:
                        # It can happen if the res['Value'] is '/a/b/c' instead of '["/a/b/c"]'
                        transDirectories.append(res["Value"])
                else:
                    transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if "MetadataCatalog" in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res["OK"]:
                self.log.error("Failed to obtain metadata catalog directories",
                               res["Message"])
                return res
            transDirectories = res["Value"]
            directories = self._addDirs(transID, transDirectories, directories)

        if not directories:
            self.log.info("No output directories found")
        directories = sorted(directories)
        return S_OK(directories)

    @classmethod
    def _addDirs(cls, transID, newDirs, existingDirs):
        """append unique :newDirs: list to :existingDirs: list

        :param self: self reference
        :param int transID: transformationID
        :param list newDirs: src list of paths
        :param list existingDirs: dest list of paths
        """
        for folder in newDirs:
            transStr = str(transID).zfill(8)
            if re.search(transStr, str(folder)):
                if folder not in existingDirs:
                    existingDirs.append(os.path.normpath(folder))
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanContent(self, directory):
        """wipe out everything from catalog under folder :directory:

        :param self: self reference
        :params str directory: folder name
        """
        self.log.verbose("Cleaning Catalog contents")
        res = self.__getCatalogDirectoryContents([directory])
        if not res["OK"]:
            return res
        filesFound = res["Value"]
        if not filesFound:
            self.log.info(
                "No files are registered in the catalog directory %s" %
                directory)
            return S_OK()
        self.log.info(
            "Attempting to remove possible remnants from the catalog and storage",
            "(n=%d)" % len(filesFound))

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        res = DataManager().removeFile(filesFound, force=True)
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "true")

        if not res["OK"]:
            return res
        realFailure = False
        for lfn, reason in res["Value"]["Failed"].items():
            if "File does not exist" in str(reason):
                self.log.warn("File %s not found in some catalog: " % (lfn))
            else:
                self.log.error("Failed to remove file found in the catalog",
                               "%s %s" % (lfn, reason))
                realFailure = True
        if realFailure:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        """get catalog contents under paths :directories:

        :param self: self reference
        :param list directories: list of paths in catalog
        """
        self.log.info("Obtaining the catalog contents for %d directories:" %
                      len(directories))
        for directory in directories:
            self.log.info(directory)
        activeDirs = directories
        allFiles = {}
        fc = FileCatalog()
        while activeDirs:
            currentDir = activeDirs[0]
            res = returnSingleResult(fc.listDirectory(currentDir))
            activeDirs.remove(currentDir)
            if not res["OK"] and "Directory does not exist" in res[
                    "Message"]:  # FIXME: DFC should return errno
                self.log.info("The supplied directory %s does not exist" %
                              currentDir)
            elif not res["OK"]:
                if "No such file or directory" in res["Message"]:
                    self.log.info("%s: %s" % (currentDir, res["Message"]))
                else:
                    self.log.error(
                        "Failed to get directory %s content" % currentDir,
                        res["Message"])
            else:
                dirContents = res["Value"]
                activeDirs.extend(dirContents["SubDirs"])
                allFiles.update(dirContents["Files"])
        self.log.info("", "Found %d files" % len(allFiles))
        return S_OK(list(allFiles))

    def cleanTransformationLogFiles(self, directory):
        """clean up transformation logs from directory :directory:

        :param self: self reference
        :param str directory: folder name
        """
        self.log.verbose("Removing log files found in the directory",
                         directory)
        res = returnSingleResult(
            StorageElement(self.logSE).removeDirectory(directory,
                                                       recursive=True))
        if not res["OK"]:
            if cmpError(res, errno.ENOENT):  # No such file or directory
                self.log.warn("Transformation log directory does not exist",
                              directory)
                return S_OK()
            self.log.error("Failed to remove log files", res["Message"])
            return res
        self.log.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """This just removes any mention of the output data from the catalog and storage"""
        self.log.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res))
            return S_OK()
        directories = res["Value"]
        for directory in directories:
            if not re.search("/LOG/", directory):
                res = self.cleanContent(directory)
                if not res["OK"]:
                    return res

        self.log.info("Removed %d directories from the catalog \
      and its files from the storage for transformation %s" %
                      (len(directories), transID))
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully removed output of transformation", transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(
            transID, "Status", "RemovedFiles")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to RemovedFiles"
                % (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation %s to RemovedFiles" %
                      (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """This just removes job from the jobDB and the transformation DB

        :param self: self reference
        :param int transID: transformation ID
        """
        self.log.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Archived")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Archived" %
                (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation %s to Archived" %
                      (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
        """
        self.log.info("Cleaning transformation", transID)
        res = self.getTransformationDirectories(transID)
        if not res["OK"]:
            self.log.error("Problem obtaining directories for transformation",
                           "%s with result '%s'" % (transID, res["Message"]))
            return S_OK()
        directories = res["Value"]
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res["OK"]:
            return res
        # Clean the log files for the jobs
        for directory in directories:
            if re.search("/LOG/", directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res["OK"]:
                    return res
            res = self.cleanContent(directory)
            if not res["OK"]:
                return res

        # Clean ALL the possible remnants found
        res = self.cleanMetadataCatalogFiles(transID)
        if not res["OK"]:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res["OK"]:
            return res
        self.log.info("Successfully cleaned transformation", transID)
        res = self.transClient.setTransformationParameter(
            transID, "Status", "Cleaned")
        if not res["OK"]:
            self.log.error(
                "Failed to update status of transformation %s to Cleaned" %
                (transID), res["Message"])
            return res
        self.log.info("Updated status of transformation",
                      "%s to Cleaned" % (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID):
        """wipe out files from catalog"""
        res = self.metadataClient.findFilesByMetadata(
            {self.transfidmeta: transID})
        if not res["OK"]:
            return res
        fileToRemove = res["Value"]
        if not fileToRemove:
            self.log.info("No files found for transID", transID)
            return S_OK()

        # Executing with shifter proxy
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "false")
        res = DataManager().removeFile(fileToRemove, force=True)
        gConfigurationData.setOptionInCFG(
            "/DIRAC/Security/UseServerCertificate", "true")

        if not res["OK"]:
            return res
        for lfn, reason in res["Value"]["Failed"].items():
            self.log.error("Failed to remove file found in metadata catalog",
                           "%s %s" % (lfn, reason))
        if res["Value"]["Failed"]:
            return S_ERROR(
                "Failed to remove all files found in the metadata catalog")
        self.log.info("Successfully removed all files found in the DFC")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        """clean tasks from WMS, or from the RMS if it is a DataManipulation transformation"""
        self.log.verbose("Cleaning Transformation tasks of transformation",
                         transID)
        res = self.__getTransformationExternalIDs(transID)
        if not res["OK"]:
            return res
        externalIDs = res["Value"]
        if externalIDs:
            res = self.transClient.getTransformationParameters(
                transID, ["Type"])
            if not res["OK"]:
                self.log.error("Failed to determine transformation type")
                return res
            transType = res["Value"]
            if transType in self.dataProcTTypes:
                res = self.__removeWMSTasks(externalIDs)
            else:
                res = self.__removeRequests(externalIDs)
            if not res["OK"]:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        """collect all ExternalIDs for transformation :transID:

        :param self: self reference
        :param int transID: transforamtion ID
        """
        res = self.transClient.getTransformationTasks(
            condDict={"TransformationID": transID})
        if not res["OK"]:
            self.log.error(
                "Failed to get externalIDs for transformation %d" % transID,
                res["Message"])
            return res
        externalIDs = [taskDict["ExternalID"] for taskDict in res["Value"]]
        self.log.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        """This will remove requests from the RMS system -"""
        rIDs = [int(int(j)) for j in requestIDs if int(j)]
        for reqID in rIDs:
            self.reqClient.cancelRequest(reqID)

        return S_OK()

    def __removeWMSTasks(self, transJobIDs):
        """delete jobs (mark their status as "JobStatus.DELETED") and their requests from the system

        :param self: self reference
        :param list trasnJobIDs: job IDs
        """
        # Prevent 0 job IDs
        jobIDs = [int(j) for j in transJobIDs if int(j)]
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):

            res = self.wmsClient.killJob(jobList)
            if res["OK"]:
                self.log.info("Successfully killed %d jobs from WMS" %
                              len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to kill jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to kill jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

            res = self.wmsClient.deleteJob(jobList)
            if res["OK"]:
                self.log.info("Successfully deleted jobs from WMS",
                              "(n=%d)" % len(jobList))
            elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs"
                                               not in res) and ("FailedJobIDs"
                                                                not in res):
                self.log.info("Found jobs which did not exist in the WMS",
                              "(n=%d)" % len(res["InvalidJobIDs"]))
            elif "NonauthorizedJobIDs" in res:
                self.log.error("Failed to delete jobs because not authorized",
                               "(n=%d)" % len(res["NonauthorizedJobIDs"]))
                allRemove = False
            elif "FailedJobIDs" in res:
                self.log.error("Failed to delete jobs",
                               "(n=%d)" % len(res["FailedJobIDs"]))
                allRemove = False

        if not allRemove:
            return S_ERROR("Failed to delete all remnants from WMS")
        self.log.info("Successfully deleted all tasks from the WMS")

        if not jobIDs:
            self.log.info(
                "JobIDs not present, unable to delete associated requests.")
            return S_OK()

        failed = 0
        failoverRequests = {}
        res = self.reqClient.getRequestIDsForJobs(jobIDs)
        if not res["OK"]:
            self.log.error("Failed to get requestID for jobs.", res["Message"])
            return res
        failoverRequests.update(res["Value"]["Successful"])
        if not failoverRequests:
            return S_OK()
        for jobID, requestID in res["Value"]["Successful"].items():
            # Put this check just in case, tasks must have associated jobs
            if jobID == 0 or jobID == "0":
                continue
            res = self.reqClient.cancelRequest(requestID)
            if not res["OK"]:
                self.log.error("Failed to remove request from RequestDB",
                               res["Message"])
                failed += 1
            else:
                self.log.verbose("Removed request %s associated to job %d." %
                                 (requestID, jobID))

        if failed:
            self.log.info("Successfully removed requests",
                          "(n=%d)" % (len(failoverRequests) - failed))
            self.log.info("Failed to remove requests", "(n=%d)" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        self.log.info(
            "Successfully removed all the associated failover requests")
        return S_OK()
예제 #17
0
class TransformationCleaningAgent( AgentModule ):
  """
  .. class:: TransformationCleaningAgent

  :param DataManger dm: DataManager instance
  :param TransfromationClient transClient: TransfromationClient instance
  :param FileCatalogClient metadataClient: FileCatalogClient instance

  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    # # data manager
    self.dm = None
    # # transformation client
    self.transClient = None
    # # wms client
    self.wmsClient = None
    # # request client
    self.reqClient = None
    # # file catalog client
    self.metadataClient = None

    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = None
    # # transformation metadata
    self.transfidmeta = None
    # # archive periof in days
    self.archiveAfter = None
    # # active SEs
    self.activeStorages = None
    # # transformation log SEs
    self.logSE = None
    # # enable/disable execution
    self.enableFlag = None

  def initialize( self ):
    """ agent initialisation

    reading and setting confing opts

    :param self: self reference
    """
    # # shifter proxy
    self.am_setOption( 'shifterProxy', 'DataManager' )
    # # transformations types
    self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] )
    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = sorted( agentTSTypes )
    else:
      self.transformationTypes = sorted( self.dataProcTTypes + self.dataManipTTypes )
    self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) )
    # # directory locations
    self.directoryLocations = sorted( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB',
                                                                                   'MetadataCatalog' ] ) )
    self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    # # transformation metadata
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    # # archive periof in days
    self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 )  # days
    self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter )
    # # active SEs
    self.activeStorages = sorted( self.am_getOption( 'ActiveSEs', [] ) )
    self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    # # transformation log SEs
    self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' )
    self.log.info( "Will remove logs found on storage element: %s" % self.logSE )
    # # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )

    # # data manager
#     self.dm = DataManager()
    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.reqClient = ReqClient()
    # # file catalog client
    self.metadataClient = FileCatalogClient()

    return S_OK()

  #############################################################################
  def execute( self ):
    """ execution in one agent's cycle

    :param self: self reference
    """

    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' )
      return S_OK( 'Disabled via CS flag' )

    # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations( { 'Status' : 'Cleaning',
                                                 'Type' : self.transformationTypes } )
    if res['OK']:
      for transDict in res['Value']:
        # # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # # We just archive
        if transDict[ 'Type' ] in self.dataManipTTypes:
          res = self.archiveTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                         res['Message'] ) )
        else:
          res = self.cleanTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'],
                                                                        res['Message'] ) )


    # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
    res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles',
                                                 'Type' : self.transformationTypes} )
    if res['OK']:
      for transDict in res['Value']:
        res = self.removeTransformationOutput( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )

    # # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter )
    res = self.transClient.getTransformations( { 'Status' : 'Completed',
                                                 'Type' : self.transformationTypes },
                                                 older = olderThanTime,
                                                 timeStamp = 'LastUpdate' )
    if res['OK']:
      for transDict in res['Value']:
        res = self.archiveTransformation( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )
    else:
      self.log.error( "Could not get the transformations" )

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    """ get the directories for the supplied transformation from the transformation system

    :param self: self reference
    :param int transID: transformation ID
    """
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        self.log.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      transDirectories = res['Value'].splitlines()
      directories = self._addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} )
      if not res['OK']:
        self.log.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )

    if not directories:
      self.log.info( "No output directories found" )
    directories = sorted( directories )
    return S_OK( directories )
  # FIXME If a classmethod, should it not have cls instead of self?
  @classmethod
  def _addDirs( self, transID, newDirs, existingDirs ):
    """ append uniqe :newDirs: list to :existingDirs: list

    :param self: self reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    """
    for folder in newDirs:
      transStr = str( transID ).zfill( 8 )
      if re.search( transStr, str( folder ) ):
        if not folder in existingDirs:
          existingDirs.append( folder )
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanStorageContents( self, directory ):
    """ delete lfn dir from all active SE

    :param self: self reference
    :param sre directory: folder name
    """
    for storageElement in self.activeStorages:
      res = self.__removeStorageDirectory( directory, storageElement )
      if not res['OK']:
        return res
    return S_OK()

  def __removeStorageDirectory( self, directory, storageElement ):
    """ wipe out all contents from :directory: at :storageElement:

    :param self: self reference
    :param str directory: path
    :param str storageElement: SE name
    """
    self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) )

    se = StorageElement( storageElement )

    res = se.getPfnForLfn( [directory] )
    if not res['OK']:
      self.log.error( "Failed to get PFN for directory", res['Message'] )
      return res
    if directory in res['Value']['Failed']:
      self.log.verbose( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, res['Value']['Failed'][directory] ) )
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectory = res['Value']['Successful'][directory]

    res = returnSingleResult( se.exists( storageDirectory ) )
    if not res['OK']:
      self.log.error( "Failed to obtain existance of directory", res['Message'] )
      return res
    exists = res['Value']
    if not exists:
      self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) )
      return S_OK()
    res = returnSingleResult( se.removeDirectory( storageDirectory, recursive = True ) )
    if not res['OK']:
      self.log.error( "Failed to remove storage directory", res['Message'] )
      return res
    self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'],
                                                                     directory,
                                                                     storageElement ) )
    return S_OK()

  def cleanCatalogContents( self, directory ):
    """ wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    """
    res = self.__getCatalogDirectoryContents( [directory] )
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      self.log.info( "No files are registered in the catalog directory %s" % directory )
      return S_OK()
    self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) )

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )
    res = DataManager().removeFile( filesFound, force = True )
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' )

    if not res['OK']:
      return res
    realFailure = False
    for lfn, reason in res['Value']['Failed'].items():
      if "File does not exist" in str( reason ):
        self.log.warn( "File %s not found in some catalog: " % ( lfn ) )
      else:
        self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) )
        realFailure = True
    if realFailure:
      return S_ERROR( "Failed to remove all files found in the catalog" )
    return S_OK()

  def __getCatalogDirectoryContents( self, directories ):
    """ get catalog contents under paths :directories:

    :param self: self reference
    :param list directories: list of paths in catalog
    """
    self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) )
    for directory in directories:
      self.log.info( directory )
    activeDirs = directories
    allFiles = {}
    fc = FileCatalog()
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = returnSingleResult( fc.listDirectory( currentDir ) )
      activeDirs.remove( currentDir )
      if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ):
        self.log.info( "The supplied directory %s does not exist" % currentDir )
      elif not res['OK']:
        if "No such file or directory" in res['Message']:
          self.log.info( "%s: %s" % ( currentDir, res['Message'] ) )
        else:
          self.log.error( "Failed to get directory %s content: %s" % ( currentDir, res['Message'] ) )
      else:
        dirContents = res['Value']
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )
    self.log.info( "Found %d files" % len( allFiles ) )
    return S_OK( allFiles.keys() )

  def cleanTransformationLogFiles( self, directory ):
    """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    """
    self.log.info( "Removing log files found in the directory %s" % directory )
    res = returnSingleResult( StorageElement( self.logSE ).removeDirectory( directory ) )
    if not res['OK']:
      self.log.error( "Failed to remove log files", res['Message'] )
      return res
    self.log.info( "Successfully removed transformation log directory" )
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput( self, transID ):
    """ This just removes any mention of the output data from the catalog and storage """
    self.log.info( "Removing output data for transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search( '/LOG/', directory ):
        res = self.cleanCatalogContents( directory )
        if not res['OK']:
          return res
        res = self.cleanStorageContents( directory )
        if not res['OK']:
          return res
    self.log.info( "Removed directories in the catalog and storage for transformation" )
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully removed output of transformation %d" % transID )
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) )
    return S_OK()

  def archiveTransformation( self, transID ):
    """ This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    """
    self.log.info( "Archiving transformation %s" % transID )
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully archived transformation %d" % transID )
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Archived" % ( transID ) )
    return S_OK()

  def cleanTransformation( self, transID ):
    """ This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
    """
    self.log.info( "Cleaning transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search( '/LOG/', directory ):
        res = self.cleanTransformationLogFiles( directory )
        if not res['OK']:
          return res
      res = self.cleanCatalogContents( directory )
      if not res['OK']:
        return res
      res = self.cleanStorageContents( directory )
      if not res['OK']:
        return res
    # Clean ALL the possible remnants found in the BK
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully cleaned transformation %d" % transID )
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Cleaned' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Cleaned" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Cleaned" % ( transID ) )
    return S_OK()

  def cleanMetadataCatalogFiles( self, transID ):
    """ wipe out files from catalog """
    res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } )
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info( 'No files found for transID %s' % transID )
      return S_OK()

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )
    res = DataManager().removeFile( fileToRemove, force = True )
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' )

    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the metadata catalog" )
    self.log.info( "Successfully removed all files found in the BK" )
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks( self, transID ):
    """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation
    """
    res = self.__getTransformationExternalIDs( transID )
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters( transID, ['Type'] )
      if not res['OK']:
        self.log.error( "Failed to determine transformation type" )
        return res
      transType = res['Value']
      if transType in self.dataProcTTypes:
        res = self.__removeWMSTasks( externalIDs )
      else:
        res = self.__removeRequests( externalIDs )
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs( self, transID ):
    """ collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transforamtion ID
    """
    res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } )
    if not res['OK']:
      self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] )
      return res
    externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ]
    self.log.info( "Found %d tasks for transformation" % len( externalIDs ) )
    return S_OK( externalIDs )

  def __removeRequests( self, requestIDs ):
    """ This will remove requests from the (new) RMS system -

        #FIXME: if the old system is still installed, it won't remove anything!!!
        (we don't want to risk removing from the new RMS what is instead in the old)
    """
    # FIXME: checking if the old system is still installed!
    from DIRAC.ConfigurationSystem.Client import PathFinder
    if PathFinder.getServiceURL( "RequestManagement/RequestManager" ):
      self.log.warn( "NOT removing requests!!" )
      return S_OK()

    rIDs = [ int( long( j ) ) for j in requestIDs if long( j ) ]
    for requestName in rIDs:
      self.reqClient.deleteRequest( requestName )

    return S_OK()

  def __removeWMSTasks( self, transJobIDs ):
    """ wipe out jobs and their requests from the system

    TODO: should check request status, maybe FTS files as well ???

    :param self: self reference
    :param list trasnJobIDs: job IDs
    """
    # Prevent 0 job IDs
    jobIDs = [ int( j ) for j in transJobIDs if int( j ) ]
    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):

      res = self.wmsClient.killJob( jobList )
      if res['OK']:
        self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

      res = self.wmsClient.deleteJob( jobList )
      if res['OK']:
        self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

    if not allRemove:
      return S_ERROR( "Failed to remove all remnants from WMS" )
    self.log.info( "Successfully removed all tasks from the WMS" )

    if not jobIDs:
      self.log.info( "JobIDs not present, unable to remove asociated requests." )
      return S_OK()

    failed = 0
    # FIXME: double request client: old/new -> only the new will survive sooner or later
    # this is the old
    try:
      res = RequestClient().getRequestForJobs( jobIDs )
      if not res['OK']:
        self.log.error( "Failed to get requestID for jobs.", res['Message'] )
        return res
      failoverRequests = res['Value']
      self.log.info( "Found %d jobs with associated failover requests (in the old RMS)" % len( failoverRequests ) )
      if not failoverRequests:
        return S_OK()
      for jobID, requestName in failoverRequests.items():
        # Put this check just in case, tasks must have associated jobs
        if jobID == 0 or jobID == '0':
          continue
        res = RequestClient().deleteRequest( requestName )
        if not res['OK']:
          self.log.error( "Failed to remove request from RequestDB", res['Message'] )
          failed += 1
        else:
          self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )
    except RuntimeError:
      failoverRequests = {}
      pass

    # FIXME: and this is the new
    res = self.reqClient.getRequestNamesForJobs( jobIDs )
    if not res['OK']:
      self.log.error( "Failed to get requestID for jobs.", res['Message'] )
      return res
    failoverRequests.update( res['Value']['Successful'] )
    if not failoverRequests:
      return S_OK()
    for jobID, requestName in res['Value']['Successful'].items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.reqClient.deleteRequest( requestName )
      if not res['OK']:
        self.log.error( "Failed to remove request from RequestDB", res['Message'] )
        failed += 1
      else:
        self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )


    if failed:
      self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
      self.log.info( "Failed to remove %s requests" % failed )
      return S_ERROR( "Failed to remove all the request from RequestDB" )
    self.log.info( "Successfully removed all the associated failover requests" )
    return S_OK()
예제 #18
0
    def test_FullChain(self):
        """This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
        """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertTrue(isinstance(res["Value"], int),
                        msg="Got %s" % type(res["Value"]))
        self.assertEqual(res["Value"],
                         res["JobID"],
                         msg="Got %s, expected %s" %
                         (str(res["Value"]), res["JobID"]))
        jobID = res["JobID"]
        jobID = res["Value"]

        # updating the status
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING,
                                          "Executing Minchiapp", "source")
        self.assertTrue(res["OK"], res.get("Message"))

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assertTrue(res["OK"], res.get("Message"))

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"][jobID]["Status"],
                         JobStatus.RECEIVED,
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobsMinorStatus([jobID])
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"],
                         {jobID: {
                             "MinorStatus": "Job Rescheduled"
                         }},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobsApplicationStatus([jobID])
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"],
                         {jobID: {
                             "ApplicationStatus": "Unknown"
                         }},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobsStates([jobID])
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(
            res["Value"],
            {
                jobID: {
                    "Status": JobStatus.RECEIVED,
                    "MinorStatus": "Job Rescheduled",
                    "ApplicationStatus": "Unknown"
                }
            },
            msg="Got %s" % str(res["Value"]),
        )

        # updating the status again
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.CHECKING,
                                          "checking", "source")
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.WAITING, "waiting",
                                          "source")
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.MATCHED, "matched",
                                          "source")
        self.assertTrue(res["OK"], res.get("Message"))

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"][jobID]["Status"],
                         JobStatus.KILLED,
                         msg="Got %s" % str(res["Value"]))

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"][jobID]["Status"],
                         JobStatus.DELETED,
                         msg="Got %s" % str(res["Value"]))
예제 #19
0
  def test_JobStateUpdateAndJobMonitoringMultuple( self ):
    """ # Now, let's submit some jobs. Different sites, types, inputs
    """
    wmsClient = WMSClient()
    jobMonitor = JobMonitoringClient()
    jobStateUpdate = RPCClient( 'WorkloadManagement/JobStateUpdate' )

    jobIDs = []
    dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
    lfnss = [['/a/1.txt', '/a/2.txt'], ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
    types = ['User', 'Test']
    for dest in dests:
      for lfns in lfnss:
        for jobType in types:
          job = helloWorldJob()
          job.setDestination( dest )
          job.setInputData( lfns )
          job.setType( jobType )
          jobDescription = createFile( job )
          res = wmsClient.submitJob( job._toJDL( xmlFile = jobDescription ) )
          self.assert_( res['OK'] )
          jobID = res['Value']
          jobIDs.append( jobID )

    res = jobMonitor.getSites()
    self.assert_( res['OK'] )
    self.assert_( set( res['Value'] ) <= set( dests + ['ANY', 'DIRAC.Jenkins.org'] ) )
    res = jobMonitor.getJobTypes()
    self.assert_( res['OK'] )
    self.assertEqual( sorted( res['Value'] ), sorted( types ) )
    res = jobMonitor.getApplicationStates()
    self.assert_( res['OK'] )
    self.assertEqual( sorted( res['Value'] ), sorted( ['Unknown'] ) )

    res = jobMonitor.getOwners()
    self.assert_( res['OK'] )
    res = jobMonitor.getOwnerGroup()
    self.assert_( res['OK'] )
    res = jobMonitor.getProductionIds()
    self.assert_( res['OK'] )
    res = jobMonitor.getJobGroups()
    self.assert_( res['OK'] )
    res = jobMonitor.getStates()
    self.assert_( res['OK'] )
    self.assert_( sorted( res['Value'] ) in [['Received'], sorted( ['Received', 'Waiting'] )] )
    res = jobMonitor.getMinorStates()
    self.assert_( res['OK'] )
    self.assert_( sorted( res['Value'] ) in [['Job accepted'], sorted( ['Job accepted', 'matching'] ) ] )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobs()
    self.assert_( res['OK'] )
    self.assert_( set( [str( x ) for x in jobIDs] ) <= set( res['Value'] ) )
#     res = jobMonitor.getCounters(attrList)
#     self.assert_( res['OK'] )
    res = jobMonitor.getCurrentJobCounters()
    self.assert_( res['OK'] )
    try:
      self.assert_( res['Value'].get( 'Received' ) + res['Value'].get( 'Waiting' ) >= long( len( dests ) * len( lfnss ) * len( types ) ) )
    except TypeError:
      pass
    res = jobMonitor.getJobsSummary( jobIDs )
    self.assert_( res['OK'] )
    res = jobMonitor.getJobPageSummaryWeb( {}, [], 0, 100 )
    self.assert_( res['OK'] )

    res = jobStateUpdate.setJobStatusBulk( jobID, {str( datetime.datetime.utcnow() ):{'Status': 'Running',
                                                                                      'MinorStatus': 'MinorStatus',
                                                                                      'ApplicationStatus': 'ApplicationStatus',
                                                                                      'Source': 'Unknown'}} )
    self.assert_( res['OK'] )
    res = jobStateUpdate.setJobsParameter( {jobID:['Status', 'Running']} )
    self.assert_( res['OK'] )

    # delete the jobs - this will just set its status to "deleted"
    wmsClient.deleteJob( jobIDs )
예제 #20
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        jobIDs = []
        dests = ['DIRAC.site1.org', 'DIRAC.site2.org']
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for dest in dests:
            for lfns in lfnss:
                for jobType in types:
                    job = helloWorldJob()
                    job.setDestination(dest)
                    job.setInputData(lfns)
                    job.setType(jobType)
                    jobDescription = createFile(job)
                    res = wmsClient.submitJob(
                        job._toJDL(xmlFile=jobDescription))
                    self.assert_(res['OK'])
                    jobID = res['Value']
                    jobIDs.append(jobID)

        res = jobMonitor.getSites()
        self.assert_(res['OK'])
        self.assert_(
            set(res['Value']) <= set(dests + ['ANY', 'DIRAC.Jenkins.ch']))
        res = jobMonitor.getJobTypes()
        self.assert_(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(types))
        res = jobMonitor.getApplicationStates()
        self.assert_(res['OK'])
        self.assertEqual(sorted(res['Value']), sorted(['Unknown']))

        res = jobMonitor.getOwners()
        self.assert_(res['OK'])
        res = jobMonitor.getOwnerGroup()
        self.assert_(res['OK'])
        res = jobMonitor.getProductionIds()
        self.assert_(res['OK'])
        res = jobMonitor.getJobGroups()
        self.assert_(res['OK'])
        res = jobMonitor.getStates()
        self.assert_(res['OK'])
        self.assert_(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assert_(res['OK'])
        self.assert_(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(['Job accepted', 'matching'])])
        self.assert_(res['OK'])
        res = jobMonitor.getJobs()
        self.assert_(res['OK'])
        self.assert_(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assert_( res['OK'] )
        res = jobMonitor.getCurrentJobCounters()
        self.assert_(res['OK'])
        try:
            self.assert_(
                res['Value'].get('Received') + res['Value'].get('Waiting') >=
                long(len(dests) * len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assert_(res['OK'])
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assert_(res['OK'])

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assert_(res['OK'])

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
예제 #21
0
class TransformationCleaningAgent(AgentModule):

    #############################################################################
    def initialize(self):
        """Sets defaults """
        self.replicaManager = ReplicaManager()
        self.transClient = TransformationClient()
        self.wmsClient = WMSClient()
        self.requestClient = RequestClient()
        self.metadataClient = FileCatalogClient()
        self.storageUsageClient = StorageUsageClient()

        # This sets the Default Proxy to used as that defined under
        # /Operations/Shifter/DataManager
        # the shifterProxy option in the Configuration can be used to change this default.
        self.am_setOption('shifterProxy', 'DataManager')

        self.transformationTypes = sortList(
            self.am_getOption('TransformationTypes', [
                'MCSimulation', 'DataReconstruction', 'DataStripping',
                'MCStripping', 'Merge', 'Replication'
            ]))
        gLogger.info("Will consider the following transformation types: %s" %
                     str(self.transformationTypes))
        self.directoryLocations = sortList(
            self.am_getOption(
                'DirectoryLocations',
                ['TransformationDB', 'StorageUsage', 'MetadataCatalog']))
        gLogger.info(
            "Will search for directories in the following locations: %s" %
            str(self.directoryLocations))
        self.transfidmeta = self.am_getOption('TransfIDMeta',
                                              "TransformationID")
        gLogger.info("Will use %s as metadata tag name for TransformationID" %
                     self.transfidmeta)
        self.archiveAfter = self.am_getOption('ArchiveAfter', 7)  # days
        gLogger.info("Will archive Completed transformations after %d days" %
                     self.archiveAfter)
        self.activeStorages = sortList(self.am_getOption('ActiveSEs', []))
        gLogger.info("Will check the following storage elements: %s" %
                     str(self.activeStorages))
        self.logSE = self.am_getOption('TransformationLogSE', 'LogSE')
        gLogger.info("Will remove logs found on storage element: %s" %
                     self.logSE)
        return S_OK()

    #############################################################################
    def execute(self):
        """ The TransformationCleaningAgent execution method.
    """
        self.enableFlag = self.am_getOption('EnableFlag', 'True')
        if not self.enableFlag == 'True':
            self.log.info(
                'TransformationCleaningAgent is disabled by configuration option %s/EnableFlag'
                % (self.section))
            return S_OK('Disabled via CS flag')

        # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
        res = self.transClient.getTransformations({
            'Status':
            'Cleaning',
            'Type':
            self.transformationTypes
        })
        if res['OK']:
            for transDict in res['Value']:
                self.cleanTransformation(transDict['TransformationID'])

        # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
        res = self.transClient.getTransformations({
            'Status':
            'RemovingFiles',
            'Type':
            self.transformationTypes
        })
        if res['OK']:
            for transDict in res['Value']:
                self.removeTransformationOutput(transDict['TransformationID'])

        # Obtain the transformations in Completed status and archive if inactive for X days
        olderThanTime = datetime.utcnow() - timedelta(days=self.archiveAfter)
        res = self.transClient.getTransformations(
            {
                'Status': 'Completed',
                'Type': self.transformationTypes
            },
            older=olderThanTime)
        if res['OK']:
            for transDict in res['Value']:
                self.archiveTransformation(transDict['TransformationID'])

        return S_OK()

    #############################################################################
    #
    # Get the transformation directories for checking
    #

    def getTransformationDirectories(self, transID):
        """ Get the directories for the supplied transformation from the transformation system """
        directories = []
        if 'TransformationDB' in self.directoryLocations:
            res = self.transClient.getTransformationParameters(
                transID, ['OutputDirectories'])
            if not res['OK']:
                gLogger.error("Failed to obtain transformation directories",
                              res['Message'])
                return res
            transDirectories = res['Value'].splitlines()
            directories = self.__addDirs(transID, transDirectories,
                                         directories)

        if 'StorageUsage' in self.directoryLocations:
            res = self.storageUsageClient.getStorageDirectories(
                '', '', transID, [])
            if not res['OK']:
                gLogger.error("Failed to obtain storage usage directories",
                              res['Message'])
                return res
            transDirectories = res['Value']
            directories = self.__addDirs(transID, transDirectories,
                                         directories)

        if 'MetadataCatalog' in self.directoryLocations:
            res = self.metadataClient.findDirectoriesByMetadata(
                {self.transfidmeta: transID})
            if not res['OK']:
                gLogger.error("Failed to obtain metadata catalog directories",
                              res['Message'])
                return res
            transDirectories = res['Value']
            directories = self.__addDirs(transID, transDirectories,
                                         directories)
        if not directories:
            gLogger.info("No output directories found")
        directories = sortList(directories)
        return S_OK(directories)

    def __addDirs(self, transID, newDirs, existingDirs):
        for dir in newDirs:
            transStr = str(transID).zfill(8)
            if re.search(transStr, dir):
                if not dir in existingDirs:
                    existingDirs.append(dir)
        return existingDirs

    #############################################################################
    #
    # These are the methods for performing the cleaning of catalogs and storage
    #

    def cleanStorageContents(self, directory):
        for storageElement in self.activeStorages:
            res = self.__removeStorageDirectory(directory, storageElement)
            if not res['OK']:
                return res
        return S_OK()

    def __removeStorageDirectory(self, directory, storageElement):
        gLogger.info('Removing the contents of %s at %s' %
                     (directory, storageElement))
        res = self.replicaManager.getPfnForLfn([directory], storageElement)
        if not res['OK']:
            gLogger.error("Failed to get PFN for directory", res['Message'])
            return res
        for directory, error in res['Value']['Failed'].items():
            gLogger.error('Failed to obtain directory PFN from LFN',
                          '%s %s' % (directory, error))
        if res['Value']['Failed']:
            return S_ERROR('Failed to obtain directory PFN from LFNs')
        storageDirectory = res['Value']['Successful'].values()[0]
        res = self.replicaManager.getStorageFileExists(storageDirectory,
                                                       storageElement,
                                                       singleFile=True)
        if not res['OK']:
            gLogger.error("Failed to obtain existance of directory",
                          res['Message'])
            return res
        exists = res['Value']
        if not exists:
            gLogger.info("The directory %s does not exist at %s " %
                         (directory, storageElement))
            return S_OK()
        res = self.replicaManager.removeStorageDirectory(storageDirectory,
                                                         storageElement,
                                                         recursive=True,
                                                         singleDirectory=True)
        if not res['OK']:
            gLogger.error("Failed to remove storage directory", res['Message'])
            return res
        gLogger.info("Successfully removed %d files from %s at %s" %
                     (res['Value']['FilesRemoved'], directory, storageElement))
        return S_OK()

    def cleanCatalogContents(self, directory):
        res = self.__getCatalogDirectoryContents([directory])
        if not res['OK']:
            return res
        filesFound = res['Value']
        if not filesFound:
            return S_OK()
        gLogger.info(
            "Attempting to remove %d possible remnants from the catalog and storage"
            % len(filesFound))
        res = self.replicaManager.removeFile(filesFound)
        if not res['OK']:
            return res
        for lfn, reason in res['Value']['Failed'].items():
            gLogger.error("Failed to remove file found in the catalog",
                          "%s %s" % (lfn, reason))
        if res['Value']['Failed']:
            return S_ERROR("Failed to remove all files found in the catalog")
        return S_OK()

    def __getCatalogDirectoryContents(self, directories):
        gLogger.info('Obtaining the catalog contents for %d directories:' %
                     len(directories))
        for directory in directories:
            gLogger.info(directory)
        activeDirs = directories
        allFiles = {}
        while len(activeDirs) > 0:
            currentDir = activeDirs[0]
            res = self.replicaManager.getCatalogListDirectory(currentDir,
                                                              singleFile=True)
            activeDirs.remove(currentDir)
            if not res['OK'] and res['Message'].endswith(
                    'The supplied path does not exist'):
                gLogger.info("The supplied directory %s does not exist" %
                             currentDir)
            elif not res['OK']:
                gLogger.error('Failed to get directory contents',
                              '%s %s' % (currentDir, res['Message']))
            else:
                dirContents = res['Value']
                activeDirs.extend(dirContents['SubDirs'])
                allFiles.update(dirContents['Files'])
        gLogger.info("Found %d files" % len(allFiles))
        return S_OK(allFiles.keys())

    def cleanTransformationLogFiles(self, directory):
        gLogger.info("Removing log files found in the directory %s" %
                     directory)
        res = self.replicaManager.removeStorageDirectory(directory,
                                                         self.logSE,
                                                         singleDirectory=True)
        if not res['OK']:
            gLogger.error("Failed to remove log files", res['Message'])
            return res
        gLogger.info("Successfully removed transformation log directory")
        return S_OK()

    #############################################################################
    #
    # These are the functional methods for archiving and cleaning transformations
    #

    def removeTransformationOutput(self, transID):
        """ This just removes any mention of the output data from the catalog and storage """
        gLogger.info("Removing output data for transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            gLogger.error(
                'Problem obtaining directories for transformation %s with result "%s"'
                % (transID, res))
            return S_OK()
        directories = res['Value']
        for directory in directories:
            if not re.search('/LOG/', directory):
                res = self.cleanCatalogContents(directory)
                if not res['OK']:
                    return res
                res = self.cleanStorageContents(directory)
                if not res['OK']:
                    return res
        gLogger.info(
            "Removed directories in the catalog and storage for transformation"
        )
        # Clean ALL the possible remnants found in the metadata catalog
        res = self.cleanMetadataCatalogFiles(transID, directories)
        if not res['OK']:
            return res
        gLogger.info("Successfully removed output of transformation %d" %
                     transID)
        # Change the status of the transformation to RemovedFiles
        res = self.transClient.setTransformationParameter(
            transID, 'Status', 'RemovedFiles')
        if not res['OK']:
            gLogger.error(
                "Failed to update status of transformation %s to RemovedFiles"
                % (transID), res['Message'])
            return res
        gLogger.info("Updated status of transformation %s to RemovedFiles" %
                     (transID))
        return S_OK()

    def archiveTransformation(self, transID):
        """ This just removes job from the jobDB and the transformation DB """
        gLogger.info("Archiving transformation %s" % transID)
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        gLogger.info("Successfully archived transformation %d" % transID)
        # Change the status of the transformation to archived
        res = self.transClient.setTransformationParameter(
            transID, 'Status', 'Archived')
        if not res['OK']:
            gLogger.error(
                "Failed to update status of transformation %s to Archived" %
                (transID), res['Message'])
            return res
        gLogger.info("Updated status of transformation %s to Archived" %
                     (transID))
        return S_OK()

    def cleanTransformation(self, transID):
        """ This removes any mention of the supplied transformation 
    """
        gLogger.info("Cleaning transformation %s" % transID)
        res = self.getTransformationDirectories(transID)
        if not res['OK']:
            gLogger.error(
                'Problem obtaining directories for transformation %s with result "%s"'
                % (transID, res))
            return S_OK()
        directories = res['Value']
        # Clean the jobs in the WMS and any failover requests found
        res = self.cleanTransformationTasks(transID)
        if not res['OK']:
            return res
        # Clean the log files for the jobs
        for directory in directories:
            if re.search('/LOG/', directory):
                res = self.cleanTransformationLogFiles(directory)
                if not res['OK']:
                    return res
            res = self.cleanCatalogContents(directory)
            if not res['OK']:
                return res
            res = self.cleanStorageContents(directory)
            if not res['OK']:
                return res
        # Clean ALL the possible remnants found in the BK
        res = self.cleanMetadataCatalogFiles(transID, directories)
        if not res['OK']:
            return res
        # Clean the transformation DB of the files and job information
        res = self.transClient.cleanTransformation(transID)
        if not res['OK']:
            return res
        gLogger.info("Successfully cleaned transformation %d" % transID)
        # Change the status of the transformation to deleted
        res = self.transClient.setTransformationParameter(
            transID, 'Status', 'Deleted')
        if not res['OK']:
            gLogger.error(
                "Failed to update status of transformation %s to Deleted" %
                (transID), res['Message'])
            return res
        gLogger.info("Updated status of transformation %s to Deleted" %
                     (transID))
        return S_OK()

    def cleanMetadataCatalogFiles(self, transID, directories):
        res = self.metadataClient.findFilesByMetadata(
            {self.transfidmeta: transID})
        if not res['OK']:
            return res
        fileToRemove = res['Value']
        if not len(fileToRemove):
            gLogger.info('No files found for transID %s' % transID)
            return S_OK()
        res = self.replicaManager.removeFile(fileToRemove)
        if not res['OK']:
            return res
        for lfn, reason in res['Value']['Failed'].items():
            gLogger.error("Failed to remove file found in metadata catalog",
                          "%s %s" % (lfn, reason))
        if res['Value']['Failed']:
            return S_ERROR(
                "Failed to remove all files found in the metadata catalog")
        gLogger.info("Successfully removed all files found in the BK")
        return S_OK()

    #############################################################################
    #
    # These are the methods for removing the jobs from the WMS and transformation DB
    #

    def cleanTransformationTasks(self, transID):
        res = self.__getTransformationExternalIDs(transID)
        if not res['OK']:
            return res
        externalIDs = res['Value']
        if externalIDs:
            res = self.transClient.getTransformationParameters(
                transID, ['Type'])
            if not res['OK']:
                gLogger.error("Failed to determine transformation type")
                return res
            transType = res['Value']
            if transType == 'Replication':
                res = self.__removeRequests(externalIDs)
            else:
                res = self.__removeWMSTasks(externalIDs)
            if not res['OK']:
                return res
        return S_OK()

    def __getTransformationExternalIDs(self, transID):
        res = self.transClient.getTransformationTasks(
            condDict={'TransformationID': transID})
        if not res['OK']:
            gLogger.error(
                "Failed to get externalIDs for transformation %d" % transID,
                res['Message'])
            return res
        externalIDs = []
        for taskDict in res['Value']:
            externalIDs.append(taskDict['ExternalID'])
        gLogger.info("Found %d tasks for transformation" % len(externalIDs))
        return S_OK(externalIDs)

    def __removeRequests(self, requestIDs):
        gLogger.error("Not removing requests but should do")
        return S_OK()

    def __removeWMSTasks(self, jobIDs):
        allRemove = True
        for jobList in breakListIntoChunks(jobIDs, 500):
            res = self.wmsClient.deleteJob(jobList)
            if res['OK']:
                gLogger.info("Successfully removed %d jobs from WMS" %
                             len(jobList))
            elif (res.has_key('InvalidJobIDs')) and (
                    not res.has_key('NonauthorizedJobIDs')) and (
                        not res.has_key('FailedJobIDs')):
                gLogger.info("Found %s jobs which did not exist in the WMS" %
                             len(res['InvalidJobIDs']))
            elif res.has_key('NonauthorizedJobIDs'):
                gLogger.error(
                    "Failed to remove %s jobs because not authorized" %
                    len(res['NonauthorizedJobIDs']))
                allRemove = False
            elif res.has_key('FailedJobIDs'):
                gLogger.error("Failed to remove %s jobs" %
                              len(res['FailedJobIDs']))
                allRemove = False
        if not allRemove:
            return S_ERROR("Failed to remove all remnants from WMS")
        gLogger.info("Successfully removed all tasks from the WMS")
        res = self.requestClient.getRequestForJobs(jobIDs)
        if not res['OK']:
            gLogger.error("Failed to get requestID for jobs.", res['Message'])
            return res
        failoverRequests = res['Value']
        gLogger.info("Found %d jobs with associated failover requests" %
                     len(failoverRequests))
        if not failoverRequests:
            return S_OK()
        failed = 0
        for jobID, requestName in failoverRequests.items():
            res = self.requestClient.deleteRequest(requestName)
            if not res['OK']:
                gLogger.error("Failed to remove request from RequestDB",
                              res['Message'])
                failed += 1
            else:
                gLogger.verbose("Removed request %s associated to job %d." %
                                (requestName, jobID))
        if failed:
            gLogger.info("Successfully removed %s requests" %
                         (len(failoverRequests) - failed))
            gLogger.info("Failed to remove %s requests" % failed)
            return S_ERROR("Failed to remove all the request from RequestDB")
        gLogger.info(
            "Successfully removed all the associated failover requests")
        return S_OK()
예제 #22
0
    def test_JobStateUpdateAndJobMonitoringMultuple(self):
        """ # Now, let's submit some jobs. Different sites, types, inputs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        jobIDs = []
        lfnss = [['/a/1.txt', '/a/2.txt'],
                 ['/a/1.txt', '/a/3.txt', '/a/4.txt'], []]
        types = ['User', 'Test']
        for lfns in lfnss:
            for jobType in types:
                job = helloWorldJob()
                job.setDestination('DIRAC.Jenkins.ch')
                job.setInputData(lfns)
                job.setType(jobType)
                jobDescription = createFile(job)
                res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
                self.assertTrue(res['OK'], res.get('Message'))
                jobID = res['Value']
            jobIDs.append(jobID)

        res = jobMonitor.getSites()
        print(res)
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set(res['Value']) <= {'ANY', 'DIRAC.Jenkins.ch'},
                        msg="Got %s" % res['Value'])
        res = jobMonitor.getJobTypes()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(types),
                         msg="Got %s" % str(sorted(res['Value'])))
        res = jobMonitor.getApplicationStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertEqual(sorted(res['Value']),
                         sorted(['Unknown']),
                         msg="Got %s" % sorted(str(res['Value'])))

        res = jobMonitor.getOwners()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getOwnerGroup()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getProductionIds()
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobGroups()
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_empty = res['Value']
        res = jobMonitor.getJobGroups(None, datetime.datetime.utcnow())
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanNow = res['Value']
        self.assertEqual(resJG_empty, resJG_olderThanNow)
        res = jobMonitor.getJobGroups(
            None,
            datetime.datetime.utcnow() - datetime.timedelta(days=365))
        self.assertTrue(res['OK'], res.get('Message'))
        resJG_olderThanOneYear = res['Value']
        self.assertTrue(
            set(resJG_olderThanOneYear).issubset(set(resJG_olderThanNow)))
        res = jobMonitor.getStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Received'],
                                     sorted(['Received', 'Waiting'])])
        res = jobMonitor.getMinorStates()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(
            sorted(res['Value']) in [['Job accepted'],
                                     sorted(
                                         ['Job accepted', 'Job Rescheduled'])])
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobs()
        self.assertTrue(res['OK'], res.get('Message'))
        self.assertTrue(set([str(x) for x in jobIDs]) <= set(res['Value']))
        #     res = jobMonitor.getCounters(attrList)
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getCurrentJobCounters()
        self.assertTrue(res['OK'], res.get('Message'))
        try:
            self.assertTrue(
                res['Value'].get('Received') +
                res['Value'].get('Waiting') >= int(len(lfnss) * len(types)))
        except TypeError:
            pass
        res = jobMonitor.getJobsSummary(jobIDs)
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobMonitor.getJobPageSummaryWeb({}, [], 0, 100)
        self.assertTrue(res['OK'], res.get('Message'))

        res = jobStateUpdate.setJobStatusBulk(
            jobID, {
                str(datetime.datetime.utcnow()): {
                    'Status': 'Running',
                    'MinorStatus': 'MinorStatus',
                    'ApplicationStatus': 'ApplicationStatus',
                    'Source': 'Unknown'
                }
            })
        self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobsParameter({jobID: ['Status', 'Running']})
        self.assertTrue(res['OK'], res.get('Message'))

        # delete the jobs - this will just set its status to "deleted"
        wmsClient.deleteJob(jobIDs)
예제 #23
0
    def test_JobStateUpdateAndJobMonitoring(self):
        """Verifying all JobStateUpdate and JobMonitoring functions"""
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = JobStateUpdateClient()

        # create a job and check stuff
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submitting the job. Checking few stuff
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assertTrue(res["OK"], res.get("Message"))
        jobID = int(res["Value"])
        # jobID = res['JobID']
        res = jobMonitor.getJobJDL(jobID, True)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobJDL(jobID, False)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobsParameters([jobID], [])
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobOwner(jobID)
        self.assertTrue(res["OK"], res.get("Message"))

        # Adding stuff

        # forcing the update
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.RUNNING, "running",
                                          "source", None, True)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobParameters(jobID, [("par1", "par1Value"),
                                                      ("par2", "par2Value")])
        time.sleep(5)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobApplicationStatus(jobID, "app status",
                                                     "source")
        self.assertTrue(res["OK"], res.get("Message"))
        #     res = jobStateUpdate.setJobFlag()
        #     self.assertTrue(res['OK'], res.get('Message'))
        #     res = jobStateUpdate.unsetJobFlag()
        #     self.assertTrue(res['OK'], res.get('Message'))
        res = jobStateUpdate.setJobSite(jobID, "Site")
        self.assertTrue(res["OK"], res.get("Message"))

        # now checking few things
        res = jobMonitor.getJobsStatus(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"][jobID]["Status"],
                         JobStatus.RUNNING,
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobParameter(jobID, "par1")
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], {"par1": "par1Value"},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobParameters(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"],
                         {jobID: {
                             "par1": "par1Value",
                             "par2": "par2Value"
                         }},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobParameters(jobID, "par1")
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], {jobID: {
            "par1": "par1Value"
        }},
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobAttribute(jobID, "Site")
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"],
                         "Site",
                         msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobAttributes(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["ApplicationStatus"],
                         "app status",
                         msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
        self.assertEqual(res["Value"]["JobName"],
                         "helloWorld",
                         msg="Got %s" % str(res["Value"]["JobName"]))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["ApplicationStatus"],
                         "app status",
                         msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
        self.assertEqual(res["Value"]["Status"],
                         JobStatus.RUNNING,
                         msg="Got %s" % str(res["Value"]["Status"]))
        res = jobMonitor.getJobHeartBeatData(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], [], msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getInputData(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"], [], msg="Got %s" % str(res["Value"]))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getAtticJobParameters(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobStateUpdate.setJobStatus(jobID, JobStatus.DONE, "MinorStatus",
                                          "Unknown")
        self.assertTrue(res["OK"], res.get("Message"))
        res = jobMonitor.getJobSummary(jobID)
        self.assertTrue(res["OK"], res.get("Message"))
        self.assertEqual(res["Value"]["Status"],
                         JobStatus.DONE,
                         msg="Got %s" % str(res["Value"]["Status"]))
        self.assertEqual(res["Value"]["MinorStatus"],
                         "MinorStatus",
                         msg="Got %s" % str(res["Value"]["MinorStatus"]))
        self.assertEqual(res["Value"]["ApplicationStatus"],
                         "app status",
                         msg="Got %s" % str(res["Value"]["ApplicationStatus"]))
        res = jobStateUpdate.sendHeartBeat(jobID, {"bih": "bih"},
                                           {"boh": "boh"})
        self.assertTrue(res["OK"], res.get("Message"))

        # delete the job - this will just set its status to "deleted"
        wmsClient.deleteJob(jobID)
예제 #24
0
    def test_JobStateUpdateAndJobMonitoring(self):
        """ Verifying all JobStateUpdate and JobMonitoring functions
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create a job and check stuff
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submitting the job. Checking few stuff
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assert_(res['OK'])
        jobID = int(res['Value'])
        # jobID = res['JobID']
        res = jobMonitor.getJobJDL(jobID, True)
        self.assert_(res['OK'])
        res = jobMonitor.getJobJDL(jobID, False)
        self.assert_(res['OK'])
        res = jobMonitor.getJobsParameters([jobID], [])
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {})
        res = jobMonitor.getJobsParameters([jobID], ['Owner'])
        self.assert_(res['OK'])

        # Adding stuff
        res = jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching',
                                          'source')
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobParameters(jobID, [('par1', 'par1Value'),
                                                      ('par2', 'par2Value')])
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobApplicationStatus(jobID, 'app status',
                                                     'source')
        self.assert_(res['OK'])
        #     res = jobStateUpdate.setJobFlag()
        #     self.assert_( res['OK'] )
        #     res = jobStateUpdate.unsetJobFlag()
        #     self.assert_( res['OK'] )
        res = jobStateUpdate.setJobSite(jobID, 'Site')
        self.assert_(res['OK'])
        #     res = jobMonitor.traceJobParameter( 'Site', 1, 'Status' )
        #     self.assert_( res['OK'] )

        # now checking few things
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Running')
        res = jobMonitor.getJobParameter(jobID, 'par1')
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {'par1': 'par1Value'})
        res = jobMonitor.getJobParameters(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], {
            'par1': 'par1Value',
            'par2': 'par2Value'
        })
        res = jobMonitor.getJobAttribute(jobID, 'Site')
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Site')
        res = jobMonitor.getJobAttributes(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['JobName'], 'helloWorld')
        res = jobMonitor.getJobSummary(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        self.assertEqual(res['Value']['Status'], 'Running')
        res = jobMonitor.getJobHeartBeatData(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getInputData(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], [])
        res = jobMonitor.getJobPrimarySummary(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getAtticJobParameters(jobID)
        self.assert_(res['OK'])
        res = jobStateUpdate.setJobsStatus([jobID], 'Done', 'MinorStatus',
                                           'Unknown')
        self.assert_(res['OK'])
        res = jobMonitor.getJobSummary(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value']['Status'], 'Done')
        self.assertEqual(res['Value']['MinorStatus'], 'MinorStatus')
        self.assertEqual(res['Value']['ApplicationStatus'], 'app status')
        res = jobStateUpdate.sendHeartBeat(jobID, {'bih': 'bih'},
                                           {'boh': 'boh'})
        self.assert_(res['OK'])

        # delete the job - this will just set its status to "deleted"
        wmsClient.deleteJob(jobID)
class TransformationCleaningAgent( AgentModule ):

  #############################################################################
  def initialize( self ):
    """Sets defaults """
    self.replicaManager = ReplicaManager()
    self.transClient = TransformationClient()
    self.wmsClient = WMSClient()
    self.requestClient = RequestClient()
    self.metadataClient = FileCatalogClient()
    self.storageUsageClient = StorageUsageClient()

    # This sets the Default Proxy to used as that defined under 
    # /Operations/Shifter/DataManager
    # the shifterProxy option in the Configuration can be used to change this default.
    self.am_setOption( 'shifterProxy', 'DataManager' )

    self.transformationTypes = sortList( self.am_getOption( 'TransformationTypes', ['MCSimulation', 'DataReconstruction', 'DataStripping', 'MCStripping', 'Merge', 'Replication'] ) )
    gLogger.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) )
    self.directoryLocations = sortList( self.am_getOption( 'DirectoryLocations', ['TransformationDB', 'StorageUsage', 'MetadataCatalog'] ) )
    gLogger.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    gLogger.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 ) # days
    gLogger.info( "Will archive Completed transformations after %d days" % self.archiveAfter )
    self.activeStorages = sortList( self.am_getOption( 'ActiveSEs', [] ) )
    gLogger.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' )
    gLogger.info( "Will remove logs found on storage element: %s" % self.logSE )
    return S_OK()

  #############################################################################
  def execute( self ):
    """ The TransformationCleaningAgent execution method.
    """
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( 'TransformationCleaningAgent is disabled by configuration option %s/EnableFlag' % ( self.section ) )
      return S_OK( 'Disabled via CS flag' )

    # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations( {'Status':'Cleaning', 'Type':self.transformationTypes} )
    if res['OK']:
      for transDict in res['Value']:
        self.cleanTransformation( transDict['TransformationID'] )

    # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
    res = self.transClient.getTransformations( {'Status':'RemovingFiles', 'Type':self.transformationTypes} )
    if res['OK']:
      for transDict in res['Value']:
        self.removeTransformationOutput( transDict['TransformationID'] )

    # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter )
    res = self.transClient.getTransformations( {'Status':'Completed', 'Type':self.transformationTypes}, older = olderThanTime )
    if res['OK']:
      for transDict in res['Value']:
        self.archiveTransformation( transDict['TransformationID'] )

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    """ Get the directories for the supplied transformation from the transformation system """
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        gLogger.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      transDirectories = res['Value'].splitlines()
      directories = self.__addDirs( transID, transDirectories, directories )

    if 'StorageUsage' in self.directoryLocations:
      res = self.storageUsageClient.getStorageDirectories( '', '', transID, [] )
      if not res['OK']:
        gLogger.error( "Failed to obtain storage usage directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self.__addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} )
      if not res['OK']:
        gLogger.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self.__addDirs( transID, transDirectories, directories )
    if not directories:
      gLogger.info( "No output directories found" )
    directories = sortList( directories )
    return S_OK( directories )

  def __addDirs( self, transID, newDirs, existingDirs ):
    for dir in newDirs:
      transStr = str( transID ).zfill( 8 )
      if re.search( transStr, dir ):
        if not dir in existingDirs:
          existingDirs.append( dir )
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanStorageContents( self, directory ):
    for storageElement in self.activeStorages:
      res = self.__removeStorageDirectory( directory, storageElement )
      if not res['OK']:
        return res
    return S_OK()

  def __removeStorageDirectory( self, directory, storageElement ):
    gLogger.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) )
    res = self.replicaManager.getPfnForLfn( [directory], storageElement )
    if not res['OK']:
      gLogger.error( "Failed to get PFN for directory", res['Message'] )
      return res
    for directory, error in res['Value']['Failed'].items():
      gLogger.error( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, error ) )
    if res['Value']['Failed']:
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectory = res['Value']['Successful'].values()[0]
    res = self.replicaManager.getStorageFileExists( storageDirectory, storageElement, singleFile = True )
    if not res['OK']:
      gLogger.error( "Failed to obtain existance of directory", res['Message'] )
      return res
    exists = res['Value']
    if not exists:
      gLogger.info( "The directory %s does not exist at %s " % ( directory, storageElement ) )
      return S_OK()
    res = self.replicaManager.removeStorageDirectory( storageDirectory, storageElement, recursive = True, singleDirectory = True )
    if not res['OK']:
      gLogger.error( "Failed to remove storage directory", res['Message'] )
      return res
    gLogger.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'], directory, storageElement ) )
    return S_OK()

  def cleanCatalogContents( self, directory ):
    res = self.__getCatalogDirectoryContents( [directory] )
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      return S_OK()
    gLogger.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) )
    res = self.replicaManager.removeFile( filesFound )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      gLogger.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the catalog" )
    return S_OK()

  def __getCatalogDirectoryContents( self, directories ):
    gLogger.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) )
    for directory in directories:
      gLogger.info( directory )
    activeDirs = directories
    allFiles = {}
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = self.replicaManager.getCatalogListDirectory( currentDir, singleFile = True )
      activeDirs.remove( currentDir )
      if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ):
        gLogger.info( "The supplied directory %s does not exist" % currentDir )
      elif not res['OK']:
        gLogger.error( 'Failed to get directory contents', '%s %s' % ( currentDir, res['Message'] ) )
      else:
        dirContents = res['Value']
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )
    gLogger.info( "Found %d files" % len( allFiles ) )
    return S_OK( allFiles.keys() )

  def cleanTransformationLogFiles( self, directory ):
    gLogger.info( "Removing log files found in the directory %s" % directory )
    res = self.replicaManager.removeStorageDirectory( directory, self.logSE, singleDirectory = True )
    if not res['OK']:
      gLogger.error( "Failed to remove log files", res['Message'] )
      return res
    gLogger.info( "Successfully removed transformation log directory" )
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput( self, transID ):
    """ This just removes any mention of the output data from the catalog and storage """
    gLogger.info( "Removing output data for transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      gLogger.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search( '/LOG/', directory ):
        res = self.cleanCatalogContents( directory )
        if not res['OK']:
          return res
        res = self.cleanStorageContents( directory )
        if not res['OK']:
          return res
    gLogger.info( "Removed directories in the catalog and storage for transformation" )
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID, directories )
    if not res['OK']:
      return res
    gLogger.info( "Successfully removed output of transformation %d" % transID )
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' )
    if not res['OK']:
      gLogger.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] )
      return res
    gLogger.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) )
    return S_OK()

  def archiveTransformation( self, transID ):
    """ This just removes job from the jobDB and the transformation DB """
    gLogger.info( "Archiving transformation %s" % transID )
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    gLogger.info( "Successfully archived transformation %d" % transID )
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' )
    if not res['OK']:
      gLogger.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] )
      return res
    gLogger.info( "Updated status of transformation %s to Archived" % ( transID ) )
    return S_OK()

  def cleanTransformation( self, transID ):
    """ This removes any mention of the supplied transformation 
    """
    gLogger.info( "Cleaning transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      gLogger.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search( '/LOG/', directory ):
        res = self.cleanTransformationLogFiles( directory )
        if not res['OK']:
          return res
      res = self.cleanCatalogContents( directory )
      if not res['OK']:
        return res
      res = self.cleanStorageContents( directory )
      if not res['OK']:
        return res
    # Clean ALL the possible remnants found in the BK
    res = self.cleanMetadataCatalogFiles( transID, directories )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    gLogger.info( "Successfully cleaned transformation %d" % transID )
    # Change the status of the transformation to deleted
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Deleted' )
    if not res['OK']:
      gLogger.error( "Failed to update status of transformation %s to Deleted" % ( transID ), res['Message'] )
      return res
    gLogger.info( "Updated status of transformation %s to Deleted" % ( transID ) )
    return S_OK()

  def cleanMetadataCatalogFiles( self, transID, directories ):
    res = self.metadataClient.findFilesByMetadata( {self.transfidmeta:transID} )
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not len(fileToRemove):
      gLogger.info('No files found for transID %s'%transID)
      return S_OK()
    res = self.replicaManager.removeFile( fileToRemove )
    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      gLogger.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the metadata catalog" )
    gLogger.info( "Successfully removed all files found in the BK" )
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks( self, transID ):
    res = self.__getTransformationExternalIDs( transID )
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters( transID, ['Type'] )
      if not res['OK']:
        gLogger.error( "Failed to determine transformation type" )
        return res
      transType = res['Value']
      if transType == 'Replication':
        res = self.__removeRequests( externalIDs )
      else:
        res = self.__removeWMSTasks( externalIDs )
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs( self, transID ):
    res = self.transClient.getTransformationTasks( condDict = {'TransformationID':transID} )
    if not res['OK']:
      gLogger.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] )
      return res
    externalIDs = []
    for taskDict in res['Value']:
      externalIDs.append( taskDict['ExternalID'] )
    gLogger.info( "Found %d tasks for transformation" % len( externalIDs ) )
    return S_OK( externalIDs )

  def __removeRequests( self, requestIDs ):
    gLogger.error( "Not removing requests but should do" )
    return S_OK()

  def __removeWMSTasks( self, jobIDs ):
    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):
      res = self.wmsClient.deleteJob( jobList )
      if res['OK']:
        gLogger.info( "Successfully removed %d jobs from WMS" % len( jobList ) )
      elif ( res.has_key( 'InvalidJobIDs' ) ) and ( not res.has_key( 'NonauthorizedJobIDs' ) ) and ( not res.has_key( 'FailedJobIDs' ) ):
        gLogger.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif res.has_key( 'NonauthorizedJobIDs' ):
        gLogger.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif res.has_key( 'FailedJobIDs' ):
        gLogger.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False
    if not allRemove:
      return S_ERROR( "Failed to remove all remnants from WMS" )
    gLogger.info( "Successfully removed all tasks from the WMS" )
    res = self.requestClient.getRequestForJobs( jobIDs )
    if not res['OK']:
      gLogger.error( "Failed to get requestID for jobs.", res['Message'] )
      return res
    failoverRequests = res['Value']
    gLogger.info( "Found %d jobs with associated failover requests" % len( failoverRequests ) )
    if not failoverRequests:
      return S_OK()
    failed = 0
    for jobID, requestName in failoverRequests.items():
      res = self.requestClient.deleteRequest( requestName )
      if not res['OK']:
        gLogger.error( "Failed to remove request from RequestDB", res['Message'] )
        failed += 1
      else:
        gLogger.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )
    if failed:
      gLogger.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
      gLogger.info( "Failed to remove %s requests" % failed )
      return S_ERROR( "Failed to remove all the request from RequestDB" )
    gLogger.info( "Successfully removed all the associated failover requests" )
    return S_OK()
예제 #26
0
    def test_FullChain(self):
        """ This test will

        - call all the WMSClient methods
          that will end up calling all the JobManager service methods
        - use the JobMonitoring to verify few properties
        - call the JobCleaningAgent to eliminate job entries from the DBs
    """
        wmsClient = WMSClient()
        jobMonitor = JobMonitoringClient()
        jobStateUpdate = RPCClient('WorkloadManagement/JobStateUpdate')

        # create the job
        job = helloWorldJob()
        jobDescription = createFile(job)

        # submit the job
        res = wmsClient.submitJob(job._toJDL(xmlFile=jobDescription))
        self.assert_(res['OK'])
        # self.assertEqual( type( res['Value'] ), int )
        # self.assertEqual( res['Value'], res['JobID'] )
        # jobID = res['JobID']
        jobID = res['Value']

        # updating the status
        jobStateUpdate.setJobStatus(jobID, 'Running', 'Executing Minchiapp',
                                    'source')

        # reset the job
        res = wmsClient.resetJob(jobID)
        self.assert_(res['OK'])

        # reschedule the job
        res = wmsClient.rescheduleJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Received')

        # updating the status again
        jobStateUpdate.setJobStatus(jobID, 'Matched', 'matching', 'source')

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Killed')

        # updating the status aaaagain
        jobStateUpdate.setJobStatus(jobID, 'Done', 'matching', 'source')

        # kill the job
        res = wmsClient.killJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'],
                         'Done')  # this time it won't kill... it's done!

        # delete the job - this will just set its status to "deleted"
        res = wmsClient.deleteJob(jobID)
        self.assert_(res['OK'])
        res = jobMonitor.getJobStatus(jobID)
        self.assert_(res['OK'])
        self.assertEqual(res['Value'], 'Deleted')
예제 #27
0
class TransformationCleaningAgent( AgentModule ):
  """
  .. class:: TransformationCleaningAgent

  :param DataManger dm: DataManager instance
  :param TransfromationClient transClient: TransfromationClient instance
  :param FileCatalogClient metadataClient: FileCatalogClient instance

  """

  def __init__( self, *args, **kwargs ):
    """ c'tor
    """
    AgentModule.__init__( self, *args, **kwargs )

    # # data manager
    self.dm = None
    # # transformation client
    self.transClient = None
    # # wms client
    self.wmsClient = None
    # # request client
    self.reqClient = None
    # # file catalog client
    self.metadataClient = None

    # # transformations types
    self.transformationTypes = None
    # # directory locations
    self.directoryLocations = None
    # # transformation metadata
    self.transfidmeta = None
    # # archive periof in days
    self.archiveAfter = None
    # # active SEs
    self.activeStorages = None
    # # transformation log SEs
    self.logSE = None
    # # enable/disable execution
    self.enableFlag = None

  def initialize( self ):
    """ agent initialisation

    reading and setting confing opts

    :param self: self reference
    """
    # # shifter proxy
    self.am_setOption( 'shifterProxy', 'DataManager' )
    # # transformations types
    self.dataProcTTypes = Operations().getValue( 'Transformations/DataProcessing', ['MCSimulation', 'Merge'] )
    self.dataManipTTypes = Operations().getValue( 'Transformations/DataManipulation', ['Replication', 'Removal'] )
    agentTSTypes = self.am_getOption( 'TransformationTypes', [] )
    if agentTSTypes:
      self.transformationTypes = sorted( agentTSTypes )
    else:
      self.transformationTypes = sorted( self.dataProcTTypes + self.dataManipTTypes )
    self.log.info( "Will consider the following transformation types: %s" % str( self.transformationTypes ) )
    # # directory locations
    self.directoryLocations = sorted( self.am_getOption( 'DirectoryLocations', [ 'TransformationDB',
                                                                                   'MetadataCatalog' ] ) )
    self.log.info( "Will search for directories in the following locations: %s" % str( self.directoryLocations ) )
    # # transformation metadata
    self.transfidmeta = self.am_getOption( 'TransfIDMeta', "TransformationID" )
    self.log.info( "Will use %s as metadata tag name for TransformationID" % self.transfidmeta )
    # # archive periof in days
    self.archiveAfter = self.am_getOption( 'ArchiveAfter', 7 )  # days
    self.log.info( "Will archive Completed transformations after %d days" % self.archiveAfter )
    # # active SEs
    self.activeStorages = sorted( self.am_getOption( 'ActiveSEs', [] ) )
    self.log.info( "Will check the following storage elements: %s" % str( self.activeStorages ) )
    # # transformation log SEs
    self.logSE = self.am_getOption( 'TransformationLogSE', 'LogSE' )
    self.log.info( "Will remove logs found on storage element: %s" % self.logSE )
    # # enable/disable execution, should be using CS option Status?? with default value as 'Active'??
    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )

    # # data manager
#     self.dm = DataManager()
    # # transformation client
    self.transClient = TransformationClient()
    # # wms client
    self.wmsClient = WMSClient()
    # # request client
    self.reqClient = ReqClient()
    # # file catalog client
    self.metadataClient = FileCatalogClient()

    return S_OK()

  #############################################################################
  def execute( self ):
    """ execution in one agent's cycle

    :param self: self reference
    """

    self.enableFlag = self.am_getOption( 'EnableFlag', 'True' )
    if not self.enableFlag == 'True':
      self.log.info( 'TransformationCleaningAgent is disabled by configuration option EnableFlag' )
      return S_OK( 'Disabled via CS flag' )

    # # Obtain the transformations in Cleaning status and remove any mention of the jobs/files
    res = self.transClient.getTransformations( { 'Status' : 'Cleaning',
                                                 'Type' : self.transformationTypes } )
    if res['OK']:
      for transDict in res['Value']:
        # # if transformation is of type `Replication` or `Removal`, there is nothing to clean.
        # # We just archive
        if transDict[ 'Type' ] in self.dataManipTTypes:
          res = self.archiveTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                         res['Message'] ) )
        else:
          res = self.cleanTransformation( transDict['TransformationID'] )
          if not res['OK']:
            self.log.error( "Problems cleaning transformation %s: %s" % ( transDict['TransformationID'],
                                                                        res['Message'] ) )


    # # Obtain the transformations in RemovingFiles status and (wait for it) removes the output files
    res = self.transClient.getTransformations( { 'Status' : 'RemovingFiles',
                                                 'Type' : self.transformationTypes} )
    if res['OK']:
      for transDict in res['Value']:
        res = self.removeTransformationOutput( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems removing transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )

    # # Obtain the transformations in Completed status and archive if inactive for X days
    olderThanTime = datetime.utcnow() - timedelta( days = self.archiveAfter )
    res = self.transClient.getTransformations( { 'Status' : 'Completed',
                                                 'Type' : self.transformationTypes },
                                                 older = olderThanTime,
                                                 timeStamp = 'LastUpdate' )
    if res['OK']:
      for transDict in res['Value']:
        res = self.archiveTransformation( transDict['TransformationID'] )
        if not res['OK']:
          self.log.error( "Problems archiving transformation %s: %s" % ( transDict['TransformationID'],
                                                                       res['Message'] ) )
    else:
      self.log.error( "Could not get the transformations" )

    return S_OK()

  #############################################################################
  #
  # Get the transformation directories for checking
  #

  def getTransformationDirectories( self, transID ):
    """ get the directories for the supplied transformation from the transformation system

    :param self: self reference
    :param int transID: transformation ID
    """
    directories = []
    if 'TransformationDB' in self.directoryLocations:
      res = self.transClient.getTransformationParameters( transID, ['OutputDirectories'] )
      if not res['OK']:
        self.log.error( "Failed to obtain transformation directories", res['Message'] )
        return res
      if type( res['Value'] ) != type( [] ):
        transDirectories = ast.literal_eval( res['Value'] )
      else:
        transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )

    if 'MetadataCatalog' in self.directoryLocations:
      res = self.metadataClient.findDirectoriesByMetadata( {self.transfidmeta:transID} )
      if not res['OK']:
        self.log.error( "Failed to obtain metadata catalog directories", res['Message'] )
        return res
      transDirectories = res['Value']
      directories = self._addDirs( transID, transDirectories, directories )

    if not directories:
      self.log.info( "No output directories found" )
    directories = sorted( directories )
    return S_OK( directories )
  # FIXME If a classmethod, should it not have cls instead of self?
  @classmethod
  def _addDirs( self, transID, newDirs, existingDirs ):
    """ append uniqe :newDirs: list to :existingDirs: list

    :param self: self reference
    :param int transID: transformationID
    :param list newDirs: src list of paths
    :param list existingDirs: dest list of paths
    """
    for folder in newDirs:
      transStr = str( transID ).zfill( 8 )
      if re.search( transStr, str( folder ) ):
        if not folder in existingDirs:
          existingDirs.append( folder )
    return existingDirs

  #############################################################################
  #
  # These are the methods for performing the cleaning of catalogs and storage
  #

  def cleanStorageContents( self, directory ):
    """ delete lfn dir from all active SE

    :param self: self reference
    :param sre directory: folder name
    """
    for storageElement in self.activeStorages:
      res = self.__removeStorageDirectory( directory, storageElement )
      if not res['OK']:
        return res
    return S_OK()

  def __removeStorageDirectory( self, directory, storageElement ):
    """ wipe out all contents from :directory: at :storageElement:

    :param self: self reference
    :param str directory: path
    :param str storageElement: SE name
    """
    self.log.info( 'Removing the contents of %s at %s' % ( directory, storageElement ) )

    se = StorageElement( storageElement )

    res = se.getPfnForLfn( [directory] )
    if not res['OK']:
      self.log.error( "Failed to get PFN for directory", res['Message'] )
      return res
    if directory in res['Value']['Failed']:
      self.log.verbose( 'Failed to obtain directory PFN from LFN', '%s %s' % ( directory, res['Value']['Failed'][directory] ) )
      return S_ERROR( 'Failed to obtain directory PFN from LFNs' )
    storageDirectory = res['Value']['Successful'][directory]

    res = returnSingleResult( se.exists( storageDirectory ) )
    if not res['OK']:
      self.log.error( "Failed to obtain existance of directory", res['Message'] )
      return res
    exists = res['Value']
    if not exists:
      self.log.info( "The directory %s does not exist at %s " % ( directory, storageElement ) )
      return S_OK()
    res = returnSingleResult( se.removeDirectory( storageDirectory, recursive = True ) )
    if not res['OK']:
      self.log.error( "Failed to remove storage directory", res['Message'] )
      return res
    self.log.info( "Successfully removed %d files from %s at %s" % ( res['Value']['FilesRemoved'],
                                                                     directory,
                                                                     storageElement ) )
    return S_OK()

  def cleanCatalogContents( self, directory ):
    """ wipe out everything from catalog under folder :directory:

    :param self: self reference
    :params str directory: folder name
    """
    res = self.__getCatalogDirectoryContents( [directory] )
    if not res['OK']:
      return res
    filesFound = res['Value']
    if not filesFound:
      self.log.info( "No files are registered in the catalog directory %s" % directory )
      return S_OK()
    self.log.info( "Attempting to remove %d possible remnants from the catalog and storage" % len( filesFound ) )

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )
    res = DataManager().removeFile( filesFound, force = True )
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' )

    if not res['OK']:
      return res
    realFailure = False
    for lfn, reason in res['Value']['Failed'].items():
      if "File does not exist" in str( reason ):
        self.log.warn( "File %s not found in some catalog: " % ( lfn ) )
      else:
        self.log.error( "Failed to remove file found in the catalog", "%s %s" % ( lfn, reason ) )
        realFailure = True
    if realFailure:
      return S_ERROR( "Failed to remove all files found in the catalog" )
    return S_OK()

  def __getCatalogDirectoryContents( self, directories ):
    """ get catalog contents under paths :directories:

    :param self: self reference
    :param list directories: list of paths in catalog
    """
    self.log.info( 'Obtaining the catalog contents for %d directories:' % len( directories ) )
    for directory in directories:
      self.log.info( directory )
    activeDirs = directories
    allFiles = {}
    fc = FileCatalog()
    while len( activeDirs ) > 0:
      currentDir = activeDirs[0]
      res = returnSingleResult( fc.listDirectory( currentDir ) )
      activeDirs.remove( currentDir )
      if not res['OK'] and res['Message'].endswith( 'The supplied path does not exist' ):
        self.log.info( "The supplied directory %s does not exist" % currentDir )
      elif not res['OK']:
        if "No such file or directory" in res['Message']:
          self.log.info( "%s: %s" % ( currentDir, res['Message'] ) )
        else:
          self.log.error( "Failed to get directory %s content: %s" % ( currentDir, res['Message'] ) )
      else:
        dirContents = res['Value']
        activeDirs.extend( dirContents['SubDirs'] )
        allFiles.update( dirContents['Files'] )
    self.log.info( "Found %d files" % len( allFiles ) )
    return S_OK( allFiles.keys() )

  def cleanTransformationLogFiles( self, directory ):
    """ clean up transformation logs from directory :directory:

    :param self: self reference
    :param str directory: folder name
    """
    self.log.info( "Removing log files found in the directory %s" % directory )
    res = returnSingleResult( StorageElement( self.logSE ).removeDirectory( directory ) )
    if not res['OK']:
      self.log.error( "Failed to remove log files", res['Message'] )
      return res
    self.log.info( "Successfully removed transformation log directory" )
    return S_OK()

  #############################################################################
  #
  # These are the functional methods for archiving and cleaning transformations
  #

  def removeTransformationOutput( self, transID ):
    """ This just removes any mention of the output data from the catalog and storage """
    self.log.info( "Removing output data for transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    for directory in directories:
      if not re.search( '/LOG/', directory ):
        res = self.cleanCatalogContents( directory )
        if not res['OK']:
          return res
        res = self.cleanStorageContents( directory )
        if not res['OK']:
          return res
    self.log.info( "Removed directories in the catalog and storage for transformation" )
    # Clean ALL the possible remnants found in the metadata catalog
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully removed output of transformation %d" % transID )
    # Change the status of the transformation to RemovedFiles
    res = self.transClient.setTransformationParameter( transID, 'Status', 'RemovedFiles' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to RemovedFiles" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to RemovedFiles" % ( transID ) )
    return S_OK()

  def archiveTransformation( self, transID ):
    """ This just removes job from the jobDB and the transformation DB

    :param self: self reference
    :param int transID: transformation ID
    """
    self.log.info( "Archiving transformation %s" % transID )
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully archived transformation %d" % transID )
    # Change the status of the transformation to archived
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Archived' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Archived" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Archived" % ( transID ) )
    return S_OK()

  def cleanTransformation( self, transID ):
    """ This removes what was produced by the supplied transformation,
        leaving only some info and log in the transformation DB.
    """
    self.log.info( "Cleaning transformation %s" % transID )
    res = self.getTransformationDirectories( transID )
    if not res['OK']:
      self.log.error( 'Problem obtaining directories for transformation %s with result "%s"' % ( transID, res ) )
      return S_OK()
    directories = res['Value']
    # Clean the jobs in the WMS and any failover requests found
    res = self.cleanTransformationTasks( transID )
    if not res['OK']:
      return res
    # Clean the log files for the jobs
    for directory in directories:
      if re.search( '/LOG/', directory ):
        res = self.cleanTransformationLogFiles( directory )
        if not res['OK']:
          return res
      res = self.cleanCatalogContents( directory )
      if not res['OK']:
        return res
      res = self.cleanStorageContents( directory )
      if not res['OK']:
        return res
    # Clean ALL the possible remnants found in the BK
    res = self.cleanMetadataCatalogFiles( transID )
    if not res['OK']:
      return res
    # Clean the transformation DB of the files and job information
    res = self.transClient.cleanTransformation( transID )
    if not res['OK']:
      return res
    self.log.info( "Successfully cleaned transformation %d" % transID )
    res = self.transClient.setTransformationParameter( transID, 'Status', 'Cleaned' )
    if not res['OK']:
      self.log.error( "Failed to update status of transformation %s to Cleaned" % ( transID ), res['Message'] )
      return res
    self.log.info( "Updated status of transformation %s to Cleaned" % ( transID ) )
    return S_OK()

  def cleanMetadataCatalogFiles( self, transID ):
    """ wipe out files from catalog """
    res = self.metadataClient.findFilesByMetadata( { self.transfidmeta : transID } )
    if not res['OK']:
      return res
    fileToRemove = res['Value']
    if not fileToRemove:
      self.log.info( 'No files found for transID %s' % transID )
      return S_OK()

    # Executing with shifter proxy
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'false' )
    res = DataManager().removeFile( fileToRemove, force = True )
    gConfigurationData.setOptionInCFG( '/DIRAC/Security/UseServerCertificate', 'true' )

    if not res['OK']:
      return res
    for lfn, reason in res['Value']['Failed'].items():
      self.log.error( "Failed to remove file found in metadata catalog", "%s %s" % ( lfn, reason ) )
    if res['Value']['Failed']:
      return S_ERROR( "Failed to remove all files found in the metadata catalog" )
    self.log.info( "Successfully removed all files found in the BK" )
    return S_OK()

  #############################################################################
  #
  # These are the methods for removing the jobs from the WMS and transformation DB
  #

  def cleanTransformationTasks( self, transID ):
    """ clean tasks from WMS, or from the RMS if it is a DataManipulation transformation
    """
    res = self.__getTransformationExternalIDs( transID )
    if not res['OK']:
      return res
    externalIDs = res['Value']
    if externalIDs:
      res = self.transClient.getTransformationParameters( transID, ['Type'] )
      if not res['OK']:
        self.log.error( "Failed to determine transformation type" )
        return res
      transType = res['Value']
      if transType in self.dataProcTTypes:
        res = self.__removeWMSTasks( externalIDs )
      else:
        res = self.__removeRequests( externalIDs )
      if not res['OK']:
        return res
    return S_OK()

  def __getTransformationExternalIDs( self, transID ):
    """ collect all ExternalIDs for transformation :transID:

    :param self: self reference
    :param int transID: transforamtion ID
    """
    res = self.transClient.getTransformationTasks( condDict = { 'TransformationID' : transID } )
    if not res['OK']:
      self.log.error( "Failed to get externalIDs for transformation %d" % transID, res['Message'] )
      return res
    externalIDs = [ taskDict['ExternalID'] for taskDict in res["Value"] ]
    self.log.info( "Found %d tasks for transformation" % len( externalIDs ) )
    return S_OK( externalIDs )

  def __removeRequests( self, requestIDs ):
    """ This will remove requests from the (new) RMS system -

        #FIXME: if the old system is still installed, it won't remove anything!!!
        (we don't want to risk removing from the new RMS what is instead in the old)
    """
    # FIXME: checking if the old system is still installed!
    from DIRAC.ConfigurationSystem.Client import PathFinder
    if PathFinder.getServiceURL( "RequestManagement/RequestManager" ):
      self.log.warn( "NOT removing requests!!" )
      return S_OK()

    rIDs = [ int( long( j ) ) for j in requestIDs if long( j ) ]
    for requestName in rIDs:
      self.reqClient.deleteRequest( requestName )

    return S_OK()

  def __removeWMSTasks( self, transJobIDs ):
    """ wipe out jobs and their requests from the system

    TODO: should check request status, maybe FTS files as well ???

    :param self: self reference
    :param list trasnJobIDs: job IDs
    """
    # Prevent 0 job IDs
    jobIDs = [ int( j ) for j in transJobIDs if int( j ) ]
    allRemove = True
    for jobList in breakListIntoChunks( jobIDs, 500 ):

      res = self.wmsClient.killJob( jobList )
      if res['OK']:
        self.log.info( "Successfully killed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to kill %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

      res = self.wmsClient.deleteJob( jobList )
      if res['OK']:
        self.log.info( "Successfully removed %d jobs from WMS" % len( jobList ) )
      elif ( "InvalidJobIDs" in res ) and ( "NonauthorizedJobIDs" not in res ) and ( "FailedJobIDs" not in res ):
        self.log.info( "Found %s jobs which did not exist in the WMS" % len( res['InvalidJobIDs'] ) )
      elif "NonauthorizedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs because not authorized" % len( res['NonauthorizedJobIDs'] ) )
        allRemove = False
      elif "FailedJobIDs" in res:
        self.log.error( "Failed to remove %s jobs" % len( res['FailedJobIDs'] ) )
        allRemove = False

    if not allRemove:
      return S_ERROR( "Failed to remove all remnants from WMS" )
    self.log.info( "Successfully removed all tasks from the WMS" )

    if not jobIDs:
      self.log.info( "JobIDs not present, unable to remove asociated requests." )
      return S_OK()

    failed = 0
    # FIXME: double request client: old/new -> only the new will survive sooner or later
    # this is the old
    try:
      res = RequestClient().getRequestForJobs( jobIDs )
      if not res['OK']:
        self.log.error( "Failed to get requestID for jobs.", res['Message'] )
        return res
      failoverRequests = res['Value']
      self.log.info( "Found %d jobs with associated failover requests (in the old RMS)" % len( failoverRequests ) )
      if not failoverRequests:
        return S_OK()
      for jobID, requestName in failoverRequests.items():
        # Put this check just in case, tasks must have associated jobs
        if jobID == 0 or jobID == '0':
          continue
        res = RequestClient().deleteRequest( requestName )
        if not res['OK']:
          self.log.error( "Failed to remove request from RequestDB", res['Message'] )
          failed += 1
        else:
          self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )
    except RuntimeError:
      failoverRequests = {}
      pass

    # FIXME: and this is the new
    res = self.reqClient.getRequestNamesForJobs( jobIDs )
    if not res['OK']:
      self.log.error( "Failed to get requestID for jobs.", res['Message'] )
      return res
    failoverRequests.update( res['Value']['Successful'] )
    if not failoverRequests:
      return S_OK()
    for jobID, requestName in res['Value']['Successful'].items():
      # Put this check just in case, tasks must have associated jobs
      if jobID == 0 or jobID == '0':
        continue
      res = self.reqClient.deleteRequest( requestName )
      if not res['OK']:
        self.log.error( "Failed to remove request from RequestDB", res['Message'] )
        failed += 1
      else:
        self.log.verbose( "Removed request %s associated to job %d." % ( requestName, jobID ) )


    if failed:
      self.log.info( "Successfully removed %s requests" % ( len( failoverRequests ) - failed ) )
      self.log.info( "Failed to remove %s requests" % failed )
      return S_ERROR( "Failed to remove all the request from RequestDB" )
    self.log.info( "Successfully removed all the associated failover requests" )
    return S_OK()