Example #1
  def getFTS3Context(self, username, group, ftsServer, threadID):
    """ Returns an fts3 context for a given user, group and fts server

        The context pool is per thread, and there is one context
        per tuple (user, group, server).
        We dump the proxy of a user to a file (shared by all the threads),
        and use it to make the context.
        The proxy needs a lifetime of at least 2h, is cached for 1.5h, and
        the lifetime of the context is 45 min

        :param username: name of the user
        :param group: group of the user
        :param ftsServer: address of the server
        :param threadID: thread ID

        :returns: S_OK with the context object

    """

    log = gLogger.getSubLogger("getFTS3Context", child=True)

    contextes = self._globalContextCache.setdefault(threadID, DictCache())

    idTuple = (username, group, ftsServer)
    log.debug("Getting context for %s" % (idTuple, ))

    if not contextes.exists(idTuple, 2700):
      res = getDNForUsername(username)
      if not res['OK']:
        return res
      # We take the first DN returned
      userDN = res['Value'][0]

      log.debug("UserDN %s" % userDN)

      # We dump the proxy to a file.
      # It has to have a lifetime of at least 2 hours
      # and we cache it for 1.5 hours
      res = gProxyManager.downloadVOMSProxyToFile(
          userDN, group, requiredTimeLeft=7200, cacheTime=5400)
      if not res['OK']:
        return res

      proxyFile = res['Value']
      log.debug("Proxy file %s" % proxyFile)

      # We generate the context
      res = FTS3Job.generateContext(ftsServer, proxyFile)
      if not res['OK']:
        return res
      context = res['Value']

      # we add it to the cache for this thread for 1h
      contextes.add(idTuple, 3600, context)

    return S_OK(contextes.get(idTuple))
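
The caching pattern above (one DictCache per thread, keyed by threadID, holding one context per (username, group, ftsServer) tuple) can be illustrated in isolation. The sketch below is a minimal stand-in for DictCache using only the standard library; the names SimpleTTLCache, getCachedContext and makeContext are hypothetical, with makeContext standing for the proxy download plus FTS3Job.generateContext step. It is not the DIRAC implementation, only the same expire-and-rebuild logic with the same 45 min / 1 h numbers.

import time


class SimpleTTLCache:
    """Minimal stand-in for DIRAC's DictCache: entries expire after a per-entry lifetime."""

    def __init__(self):
        self._store = {}  # key -> (expiration timestamp, value)

    def exists(self, key, validSeconds=0):
        """Return True if key is present and still valid for at least validSeconds more seconds."""
        entry = self._store.get(key)
        return entry is not None and entry[0] - time.time() >= validSeconds

    def add(self, key, lifetime, value):
        self._store[key] = (time.time() + lifetime, value)

    def get(self, key):
        entry = self._store.get(key)
        return entry[1] if entry else None


# One cache per thread, as in self._globalContextCache.setdefault(threadID, DictCache())
_globalContextCache = {}


def getCachedContext(threadID, idTuple, makeContext):
    contexts = _globalContextCache.setdefault(threadID, SimpleTTLCache())
    # Rebuild only if the cached context has less than 45 minutes of validity left
    if not contexts.exists(idTuple, 2700):
        contexts.add(idTuple, 3600, makeContext())
    return contexts.get(idTuple)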
Example #2
    def _createNewJob(self, jobType, ftsFiles, targetSE, sourceSE=None):
        """ Create a new FTS3Job object
        :param jobType: type of job to create (Transfer, Staging, Removal)
        :param ftsFiles: list of FTS3File objects the job has to work on
        :param targetSE: SE on which to operate
        :param sourceSE: source SE, only useful for Transfer jobs

        :return FTS3Job object
     """

        newJob = FTS3Job()
        newJob.type = jobType
        newJob.sourceSE = sourceSE
        newJob.targetSE = targetSE
        newJob.activity = self.activity
        newJob.priority = self.priority
        newJob.username = self.username
        newJob.userGroup = self.userGroup
        newJob.vo = self.vo
        newJob.filesToSubmit = ftsFiles
        newJob.operationID = getattr(self, 'operationID')

        return newJob
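
A brief, hypothetical call-site sketch for the factory above. The grouping logic, the SE names and the op instance are placeholders, not taken from the source; only the _createNewJob signature and the ftsFiles/ftsJobs attributes come from the examples on this page.

# Hypothetical call site (op is an existing FTS3TransferOperation-like instance):
# pick the files headed for one target and wrap them in a new Transfer job.
filesForTarget = [f for f in op.ftsFiles if f.targetSE == "CERN-BUFFER"]
newJob = op._createNewJob("Transfer", filesForTarget,
                          targetSE="CERN-BUFFER", sourceSE="RAL-BUFFER")
op.ftsJobs.append(newJob)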
Example #3
def test_raceCondition(fts3db):
    """This tests a race condition that was exhibited when
    running multiple agent in parallel. What was happening
    was that we were getting some nonFinishedOperations
    for further processing while some jobs associated to that
    operation were being monitored.

    This test reproduces all the possible combination of job/operation
    being assigned/non assigned

    | OpID | OpAssigned | JobID | JobAssigned | Comment                                                                                   |
    |------|------------|-------|-------------|-------------------------------------------------------------------------------------------|
    | 1    |            |       |             | No job                                                                                    |
    | 2    | Yes        |       |             | No job                                                                                    |
    | 3    |            | 1     |             | Nothing is assigned                                                                       |
    | 4    |            | 2     | Yes         | Job is assigned, so can't use the operation                                               |
    | 5    | Yes        | 3     |             | Op is assigned, so can't use it                                                           |
    | 6    | Yes        | 4     | Yes         | That would be a problematic situation!!                                                   |
    | 7    |            | 5     | Yes         | Job 5 is assigned, so Op 7 cannot be used, even if Job 6 is unassigned (this was the bug) |
    |      |            | 6     |             |                                                                                           |
    | 8    | Yes        | 7     | Yes         | Op 8 is assigned, so can't be used (and is problematic like Op 6)                         |
    |      | Yes        | 8     |             |                                                                                           |

    Under these circumstances, we want:

    * getNonFinishedOperations to return operations 1 and 3
    * getActiveJobs to return jobs 1 and 6


    """

    # Utility to create an FTS3File.
    # All operations must have at least one file associated
    # for the queries to make sense
    def _makeFile():
        f = FTS3File()
        f.targetSE = "targetSE"
        return f

    # op1: Non assigned operation without any job
    op1 = FTS3TransferOperation()
    op1.operationID = 1
    op1.ftsFiles.append(_makeFile())

    # op2: assigned operation without any job
    op2 = FTS3TransferOperation()
    op2.operationID = 2
    op2.ftsFiles.append(_makeFile())

    # op3: Non assigned operation with one non assigned job
    op3 = FTS3TransferOperation()
    op3.operationID = 3
    op3.ftsFiles.append(_makeFile())
    j1 = FTS3Job()
    j1.jobID = 1
    op3.ftsJobs.append(j1)

    # op4: Non assigned operation with one assigned job
    op4 = FTS3TransferOperation()
    op4.operationID = 4
    op4.ftsFiles.append(_makeFile())
    j2 = FTS3Job()
    j2.jobID = 2
    op4.ftsJobs.append(j2)

    # op5: assigned operation with one non assigned job
    op5 = FTS3TransferOperation()
    op5.operationID = 5
    op5.ftsFiles.append(_makeFile())
    j3 = FTS3Job()
    j3.jobID = 3
    op5.ftsJobs.append(j3)

    # op6: assigned operation with one assigned job
    # This is a very problematic case that we want
    # to avoid

    op6 = FTS3TransferOperation()
    op6.operationID = 6
    op6.ftsFiles.append(_makeFile())
    j4 = FTS3Job()
    j4.jobID = 4
    op6.ftsJobs.append(j4)

    # op7: Non assigned operation with one assigned job and one non assigned job
    op7 = FTS3TransferOperation()
    op7.operationID = 7
    op7.ftsFiles.append(_makeFile())
    j5 = FTS3Job()
    j5.jobID = 5
    op7.ftsJobs.append(j5)
    j6 = FTS3Job()
    op7.ftsFiles.append(_makeFile())
    j6.jobID = 6
    op7.ftsJobs.append(j6)

    # op8: assigned operation with one assigned job and one non assigned job
    # That is problematic, like op6
    op8 = FTS3TransferOperation()
    op8.operationID = 8
    j7 = FTS3Job()
    op8.ftsFiles.append(_makeFile())
    j7.jobID = 7
    op8.ftsJobs.append(j7)
    j8 = FTS3Job()
    j8.jobID = 8
    op8.ftsJobs.append(j8)

    allOps = [op1, op2, op3, op4, op5, op6, op7, op8]
    for op in allOps:
        res = fts3db.persistOperation(op)
        assert res["OK"]

    with fts3db.engine.begin() as conn:
        conn.execute(
            update(FTS3DB.fts3JobTable).values(assignment="Yes").where(
                FTS3DB.fts3JobTable.c.jobID.in_([2, 4, 5, 7])))

    with fts3db.engine.begin() as conn:
        conn.execute(
            update(FTS3DB.fts3OperationTable).values(assignment="Yes").where(
                FTS3DB.fts3OperationTable.c.operationID.in_([2, 5, 6, 8])))

    res = fts3db.getNonFinishedOperations(operationAssignmentTag=None)
    assert res["OK"]
    nonFinishedOps = res["Value"]
    nonFinishedOpsIDs = [op.operationID for op in nonFinishedOps]
    assert nonFinishedOpsIDs == [1, 3]

    res = fts3db.getActiveJobs(jobAssignmentTag=None)
    assert res["OK"]
    activeJobs = res["Value"]
    activeJobIDs = [op.jobID for op in activeJobs]
    assert activeJobIDs == [1, 6]
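
The expectations above (operations 1 and 3, jobs 1 and 6) imply that getNonFinishedOperations must skip an operation as soon as either the operation itself or any of its jobs carries an assignment. The sketch below shows the kind of filter that would satisfy the test; it reuses the table and column names visible in the test (fts3OperationTable, fts3JobTable, assignment, operationID) but assumes the job table carries an operationID foreign key, leaves out the status filtering the real FTS3DB method also needs, and is not the actual implementation.

from sqlalchemy import select


def unassignedOperationIDs(conn):
    """Return IDs of operations that are not assigned and have no assigned job."""
    # Sub-select: operations that have at least one assigned job
    opsWithAssignedJobs = select(FTS3DB.fts3JobTable.c.operationID).where(
        FTS3DB.fts3JobTable.c.assignment.isnot(None))

    query = select(FTS3DB.fts3OperationTable.c.operationID).where(
        FTS3DB.fts3OperationTable.c.assignment.is_(None),
        FTS3DB.fts3OperationTable.c.operationID.notin_(opsWithAssignedJobs))

    return [row.operationID for row in conn.execute(query)]

Run against the fixture built in this test, such a filter should keep only operations 1 and 3, matching the assertion on nonFinishedOpsIDs.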
Example #4
    def getFTS3Context(self, username, group, ftsServer, threadID):
        """ Returns an fts3 context for a given user, group and fts server

        The context pool is per thread, and there is one context
        per tuple (user, group, server).
        We dump the proxy of a user to a file (shared by all the threads),
        and use it to make the context.
        The proxy needs a lifetime of self.proxyLifetime, is cached for cacheTime = (2*proxyLifetime/3) - 10 min,
        and the lifetime of the context is 45 min.
        The reason for this cacheTime is that the FTS3 server will ask for a new proxy
        once 2/3 of the existing proxy lifetime has elapsed, so we renew it just before that.

        :param str username: name of the user
        :param str group: group of the user
        :param str ftsServer: address of the server
        :param str threadID: thread ID

        :returns: S_OK with the context object

    """

        log = gLogger.getSubLogger("getFTS3Context", child=True)

        contextes = self._globalContextCache.setdefault(threadID, DictCache())

        idTuple = (username, group, ftsServer)
        log.debug("Getting context for %s" % (idTuple, ))

        # We keep a context in the cache for 45 minutes
        # (so it needs to be valid for at least 15 more minutes, since we add it for one hour)
        if not contextes.exists(idTuple, 15 * 60):
            res = getDNForUsername(username)
            if not res['OK']:
                return res
            # We take the first DN returned
            userDN = res['Value'][0]

            log.debug("UserDN %s" % userDN)

            # We dump the proxy to a file.
            # It has to have a lifetime of self.proxyLifetime
            # Because the FTS3 servers cache it for 2/3 of its lifetime,
            # we should make our cache a bit shorter than 2/3 of the lifetime
            cacheTime = int(2 * self.proxyLifetime / 3) - 600
            res = gProxyManager.downloadVOMSProxyToFile(
                userDN,
                group,
                requiredTimeLeft=self.proxyLifetime,
                cacheTime=cacheTime)
            if not res['OK']:
                return res

            proxyFile = res['Value']
            log.debug("Proxy file %s" % proxyFile)

            # We generate the context
            # In practice, the lifetime will be less than proxyLifetime
            # because we reuse a cached proxy. However, the cached proxy will
            # never force a redelegation, because it is recent enough for the FTS3 servers.
            # The delegation is forced once 2/3 of the lifetime has elapsed, and we get a
            # fresh proxy just before that, so there is no problem.
            res = FTS3Job.generateContext(ftsServer,
                                          proxyFile,
                                          lifetime=self.proxyLifetime)

            if not res['OK']:
                return res
            context = res['Value']

            # we add it to the cache for this thread for 1h
            contextes.add(idTuple, 3600, context)

        return S_OK(contextes.get(idTuple))
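
As a concrete illustration of the cacheTime arithmetic above (the 12 hour proxy lifetime is a made-up value for the example, not something taken from the source):

proxyLifetime = 12 * 3600                      # hypothetical value: 43200 s
cacheTime = int(2 * proxyLifetime / 3) - 600   # 28800 - 600 = 28200 s, i.e. 7h50
# The cached proxy is therefore replaced shortly before 2/3 of its lifetime has elapsed,
# which is just before the FTS3 server would ask for a redelegation.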
Example #5
    def test_05_cancelNotFoundJob(self):
        """When a job disappears from the server, we need to cancel it
        and its files.

        The scenario is as follows. Operation has 4 files.
        Job1 is submitted for File1 and File2.
        Job2 is submitted for File3 and File4.
        File1 is finished, and then the job disappears.
        We need to cancel Job1 and File2.
        Job2, File3 and File4 are here to make sure we do not wrongly cancel other files
        """

        op = self.generateOperation("Transfer", 4, ["Target1"])

        job1 = FTS3Job()
        job1GUID = "05-cancelall-job1"
        job1.ftsGUID = job1GUID
        job1.ftsServer = "fts3"

        job1.username = op.username
        job1.userGroup = op.userGroup

        # assign the GUID to the files
        op.ftsFiles[0].ftsGUID = job1GUID
        op.ftsFiles[1].ftsGUID = job1GUID

        # Pretend

        op.ftsJobs.append(job1)

        job2 = FTS3Job()
        job2GUID = "05-cancelall-job2"
        job2.ftsGUID = job2GUID
        job2.ftsServer = "fts3"

        job2.username = op.username
        job2.userGroup = op.userGroup

        # assign the GUID to the files
        op.ftsFiles[2].ftsGUID = job2GUID
        op.ftsFiles[3].ftsGUID = job2GUID

        op.ftsJobs.append(job2)

        res = self.db.persistOperation(op)
        opID = res["Value"]

        # Get back the operation to update all the IDs
        res = self.db.getOperation(opID)
        op = res["Value"]

        fileIds = []
        for ftsFile in op.ftsFiles:
            fileIds.append(ftsFile.fileID)

        # Now we monitor Job1, and find that the first file is finished, the second is still ongoing
        # And since File1 is in an FTS final status, we set its ftsGUID to None
        file1ID = op.ftsFiles[0].fileID
        file2ID = op.ftsFiles[1].fileID
        fileStatusDict = {
            file1ID: {
                "status": "Finished",
                "ftsGUID": None
            },
            file2ID: {
                "status": "Staging"
            }
        }

        # And when updating, take care of specifying that you are updating for a given GUID
        res = self.db.updateFileStatus(fileStatusDict, ftsGUID=job1GUID)
        self.assertTrue(res["OK"])

        # Now we monitor Job1 again, and find out that it has disappeared
        # So we cancel the job and the files
        res = self.db.cancelNonExistingJob(opID, job1GUID)
        self.assertTrue(res["OK"])

        # And hopefully now File2 is Canceled, while the others are as they were
        res = self.client.getOperation(opID)
        op = res["Value"]

        self.assertTrue(op.ftsFiles[0].status == "Finished")
        self.assertTrue(op.ftsFiles[1].status == "Canceled")
        self.assertTrue(op.ftsFiles[1].ftsGUID is None)
        self.assertTrue(op.ftsFiles[2].status == "New")
        self.assertTrue(op.ftsFiles[3].status == "New")
Example #6
    def test_04_job_monitoring_solve_racecondition(self):
        """We used to have a race condition resulting in duplicated transfers for a file.
        This test reproduces the race condition to make sure it is fixed.
        This test makes sure that the update only happens on files concerned by the job

        The scenario is as follows. Operation has two files, File1 and File2.
        Job1 is submitted for File1 and File2.
        File1 fails, File2 is still ongoing.
        We submit Job2 for File1.
        Job1 is monitored again, and we update File1 to failed again (because it is so in Job1).
        A Job3 would be created for File1, despite Job2 still running on it.
        """
        op = self.generateOperation("Transfer", 2, ["Target1"])

        job1 = FTS3Job()
        job1GUID = "04-racecondition-job1"
        job1.ftsGUID = job1GUID
        job1.ftsServer = "fts3"

        job1.username = op.username
        job1.userGroup = op.userGroup

        op.ftsJobs.append(job1)

        # Now, when submitting the job, we specify the ftsGUID to which files are
        # assigned
        for ftsFile in op.ftsFiles:
            ftsFile.ftsGUID = job1GUID

        res = self.client.persistOperation(op)
        opID = res["Value"]

        # Get back the operation to update all the IDs
        res = self.client.getOperation(opID)
        op = res["Value"]

        fileIds = []
        for ftsFile in op.ftsFiles:
            fileIds.append(ftsFile.fileID)

        # Arbitrarily decide that File1 has the smallest fileID
        file1ID = min(fileIds)
        file2ID = max(fileIds)

        # Now we monitor Job1, and find that the first file has failed, the second is still ongoing
        # And since File1 is in an FTS final status, we set its ftsGUID to None
        fileStatusDict = {
            file1ID: {
                "status": "Failed",
                "error": "Someone made a boo-boo",
                "ftsGUID": None
            },
            file2ID: {
                "status": "Staging"
            },
        }

        # And when updating, take care of specifying that you are updating for a given GUID
        res = self.db.updateFileStatus(fileStatusDict, ftsGUID=job1GUID)
        self.assertTrue(res["OK"])

        # We would then submit a second job
        job2 = FTS3Job()
        job2GUID = "04-racecondition-job2"
        job2.ftsGUID = job2GUID
        job2.ftsServer = "fts3"

        job2.username = op.username
        job2.userGroup = op.userGroup

        op.ftsJobs.append(job2)

        # And do not forget to add the new FTSGUID to File1
        # assigned
        for ftsFile in op.ftsFiles:
            if ftsFile.fileID == file1ID:
                ftsFile.ftsGUID = job2GUID

        res = self.client.persistOperation(op)

        # Now we monitor Job2 & Job1 (in this order)
        fileStatusDictJob2 = {
            file1ID: {
                "status": "Staging"
            },
        }

        # Again specify the GUID
        res = self.db.updateFileStatus(fileStatusDictJob2, ftsGUID=job2GUID)
        self.assertTrue(res["OK"])

        # And in Job1, File1 is (and will remain) failed, while File2 is still ongoing
        fileStatusDictJob1 = {
            file1ID: {
                "status": "Failed",
                "error": "Someone made a boo-boo"
            },
            file2ID: {
                "status": "Staging"
            },
        }

        # And thanks to specifying the job GUID, File1 should not be touched!
        res = self.db.updateFileStatus(fileStatusDictJob1, ftsGUID=job1GUID)
        self.assertTrue(res["OK"])

        # And hopefully now there shouldn't be any file to submit
        res = self.client.getOperation(opID)
        op = res["Value"]

        # isTotallyProcessed does not return S_OK struct
        filesToSubmit = op._getFilesToSubmit()
        self.assertEqual(filesToSubmit, [])
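
The reason the second monitoring pass on Job1 cannot clobber File1 is that updateFileStatus is called with ftsGUID=job1GUID, so only rows still attached to that GUID may be modified. Below is a minimal sketch of such a guarded update; the table and column names (fts3FileTable, fileID, ftsGUID, status) are assumptions in the spirit of the tests above, not the actual FTS3DB code.

from sqlalchemy import update


def guardedFileStatusUpdate(conn, fileID, newStatus, ftsGUID=None):
    stmt = update(FTS3DB.fts3FileTable).values(status=newStatus).where(
        FTS3DB.fts3FileTable.c.fileID == fileID)
    if ftsGUID is not None:
        # Only touch the row if the file still belongs to the job being monitored;
        # if it has been re-assigned to another job (new GUID), this matches nothing.
        stmt = stmt.where(FTS3DB.fts3FileTable.c.ftsGUID == ftsGUID)
    conn.execute(stmt)

With such a guard, the late update from Job1 (still reporting File1 as Failed) matches zero rows, because File1's ftsGUID already points to Job2; that is exactly what the final assertion on filesToSubmit checks.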
Example #7
    def test_03_job_monitoring_racecondition(self):
        """We used to have a race condition resulting in duplicated transfers for a file.
        This test reproduces the race condition.

        The scenario is as follows. Operation has two files, File1 and File2.
        Job1 is submitted for File1 and File2.
        File1 fails, File2 is still ongoing.
        We submit Job2 for File1.
        Job1 is monitored again, and we update File1 to failed again (because it is so in Job1).
        A Job3 would be created for File1, despite Job2 still running on it.
        """
        op = self.generateOperation("Transfer", 2, ["Target1"])

        job1 = FTS3Job()
        job1.ftsGUID = "03-racecondition-job1"
        job1.ftsServer = "fts3"

        job1.username = op.username
        job1.userGroup = op.userGroup

        op.ftsJobs.append(job1)

        res = self.client.persistOperation(op)
        opID = res["Value"]

        # Get back the operation to update all the IDs
        res = self.client.getOperation(opID)
        op = res["Value"]

        fileIds = []
        for ftsFile in op.ftsFiles:
            fileIds.append(ftsFile.fileID)

        file1ID = min(fileIds)
        file2ID = max(fileIds)

        # Now we monitor Job1, and find that the first file has failed, the second is still ongoing
        fileStatusDict = {
            file1ID: {
                "status": "Failed",
                "error": "Someone made a boo-boo"
            },
            file2ID: {
                "status": "Staging"
            },
        }

        res = self.db.updateFileStatus(fileStatusDict)
        self.assertTrue(res["OK"])

        # We would then submit a second job
        job2 = FTS3Job()
        job2.ftsGUID = "03-racecondition-job2"
        job2.ftsServer = "fts3"

        job2.username = op.username
        job2.userGroup = op.userGroup

        op.ftsJobs.append(job2)
        res = self.client.persistOperation(op)

        # Now we monitor Job2 & Job1 (in this order)
        fileStatusDictJob2 = {
            file1ID: {
                "status": "Staging"
            },
        }
        res = self.db.updateFileStatus(fileStatusDictJob2)
        self.assertTrue(res["OK"])

        # And in Job1, File1 is (and will remain) failed, while File2 is still ongoing
        fileStatusDictJob1 = {
            file1ID: {
                "status": "Failed",
                "error": "Someone made a boo-boo"
            },
            file2ID: {
                "status": "Staging"
            },
        }
        res = self.db.updateFileStatus(fileStatusDictJob1)
        self.assertTrue(res["OK"])

        # And now this is the problem, because if we check whether this operation still has
        # files to submit, it will tell us yes, while all the files are already being taken care of
        res = self.client.getOperation(opID)
        op = res["Value"]

        # isTotallyProcessed does not return S_OK struct
        filesToSubmit = op._getFilesToSubmit()
        self.assertEqual(filesToSubmit, [op.ftsFiles[0]])