Example #1
    def test_update(self):
        from Ganga import GPI
        t = GPI.LHCbTask()
        tr = GPI.LHCbTransform(application=DaVinci(), backend=Dirac())
        t.appendTransform(tr)
        try:
            bkQueryList = [GPI.BKTestQuery(stripping20up)]
            tr.updateQuery()
            assert False, 'Should have thrown exception if updated with no query'
        except:
            tr.addQuery(GPI.BKTestQuery(stripping20down))

            # Check some new data added
            assert len(tr.inputdata), 'No data added after call to update'

            try:
                # Shouldn't allow a second update before processed the data in
                # toProcess_dataset
                tr.updateQuery()
                assert False, 'Should have thrown an error if updated with files already to process'
            except:
                # run so can update again with a removed dataset recall that jobs with the
                # old dataset only created when run called.
                t.run()
                assert len(tr.getJobs()), "No Jobs created upon run()"
                job = GPI.jobs(int(tr.getJobs()[0].fqid.split('.')[0]))
                sleep_until_state(job, 300, 'submitted')
                del tr._impl.query.dataset.files[0]
                tr.update(True)

                # Check the dead dataset is picked up
                assert len(tr._impl.removed_data.files
                           ), "Didn\'t Pick up loss of a dataset"
                job.remove()
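The test above relies on helpers defined elsewhere in the test suite (sleep_until_state and the stripping20up/stripping20down bookkeeping paths). As a rough sketch only, assuming an LHCb-enabled Ganga session where DaVinci, Dirac and BKTestQuery are available, the same task/transform/query pattern looks like this (the bookkeeping path is a placeholder, not a real query):

from Ganga import GPI

t = GPI.LHCbTask()
tr = GPI.LHCbTransform(application=DaVinci(), backend=Dirac())
t.appendTransform(tr)
tr.addQuery(GPI.BKTestQuery('/LHCb/Collision11/.../90000000/DIMUON.DST'))  # placeholder path

t.update()  # pull new files in from the bookkeeping query
t.run()     # create and submit jobs for the queued data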
Example #2
    def getJobs(self):
        """ Get the job slice of all jobs that process this task """
        jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
        for trf in self.transforms:
            for jid in trf.getJobs():
                jobslice.objects[GPI.jobs(jid).fqid] = stripProxy(GPI.jobs(jid))

        return JobRegistrySliceProxy(jobslice)
Example #3
    def getJobs(self):
        """ Get the job slice of all jobs that process this task """
        jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
        for trf in self.transforms:
            for jid in trf.getJobs():
                jobslice.objects[GPI.jobs(jid).fqid] = stripProxy(
                    GPI.jobs(jid))

        return JobRegistrySliceProxy(jobslice)
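A hedged sketch of using the returned slice from a GPI session, assuming a task with submitted jobs already exists:

for j in tasks(0).getJobs():
    print("%s %s" % (j.fqid, j.status))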
Example #4
 def getTransform(self):
     tid = self.tasks_id.split(":")
     if len(tid) == 2 and tid[0].isdigit() and tid[1].isdigit():
         try:
             task = GPI.tasks(int(tid[0]))
         except KeyError:
             return None
         if task:
             return task.transforms[int(tid[1])]
     if len(tid) == 3 and tid[1].isdigit() and tid[2].isdigit():
         task = GPI.tasks(int(tid[1]))
         if task:
             return task.transforms[int(tid[2])]
     return None
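For illustration only, the tasks_id string parsed above follows a "task:transform" convention; resolving it by hand in a session would look roughly like this (the id values are made up):

tid = "0:1".split(":")                    # hypothetical application.tasks_id
task = GPI.tasks(int(tid[0]))             # task 0
transform = task.transforms[int(tid[1])]  # its second transform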
Example #6
    def test_appendTransform(self):
        from Ganga import GPI
        tr1 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
        t = GPI.LHCbTask()

        # Try appending
        t.appendTransform(tr1)
        assert len(t.transforms), 'Didn\'t append a transform properly'

        # Try appending a transform with a query and check for update
        tr2 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
        tr2.addQuery(GPI.BKTestQuery(stripping15up))
        t.appendTransform(tr2)
        assert len(t.transforms[-1]._impl.toProcess_dataset.files
                   ), 'Transform not updated properly after appending'
Example #7
    def createNewJob(self):
        """Create any jobs required for this unit"""
        j = GPI.Job()
        j._impl.backend = self._getParent().backend.clone()
        j._impl.application = self._getParent().application.clone()
        if self.inputdata is not None:
            j.inputdata = self.inputdata.clone()

        trf = self._getParent()
        task = trf._getParent()

        # copy across the outputfiles
        for f in trf.outputfiles:
            j.outputfiles += [f.clone()]

        j.inputsandbox = trf.inputsandbox

        # Sort out the splitter
        if trf.splitter:
            j.splitter = trf.splitter.clone()

        # Postprocessors
        for pp in trf.postprocessors:
            j.postprocessors.append(deepcopy(pp))

        return j
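createNewJob() clones the transform's backend and application so the job owns independent copies. A minimal sketch, assuming unit is an existing unit object, of how such a job is then driven (mirroring the submission code in Example #49):

j = unit.createNewJob()
if j.name == '':
    j.name = "T0:0 U0"   # illustrative name only
j.submit()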
Example #8
    def copyOutput(self):
        """Copy the output data to local storage"""

        job = GPI.jobs(self.active_job_ids[0])

        if self.copy_output._name != "TaskLocalCopy" or job.outputdata._impl._name != "DQ2OutputDataset":
            logger.error(
                "Cannot transfer from DS type '%s' to '%s'. Please contact plugin developer."
                % (job.outputdata._name, self.copy_output._name))
            return False

        # check which files still need downloading
        to_download = []
        for f in job.outputfiles:

            # check for REs
            if self.copy_output.isValid(os.path.join(
                    f.localDir,
                    f.namePattern)) and not self.copy_output.isDownloaded(
                        os.path.join(f.localDir, f.namePattern)):
                to_download.append(f)

        # is everything downloaded?
        if len(to_download) == 0:
            return True

        # nope, so pick the requested number and off we go
        for f in to_download:
            f.get()

        return False
Example #9
    def copyOutput(self):
        """Copy the output data to local storage"""

        job = GPI.jobs(self.active_job_ids[0])

        if self.copy_output._name != "TaskLocalCopy" or job.outputdata._impl._name != "DQ2OutputDataset":
            logger.error(
                "Cannot transfer from DS type '%s' to '%s'. Please contact plugin developer."
                % (job.outputdata._name, self.copy_output._name)
            )
            return False

        # check which files still need downloading
        to_download = []
        for f in job.outputfiles:

            # check for REs
            if self.copy_output.isValid(os.path.join(f.localDir, f.namePattern)) and not self.copy_output.isDownloaded(
                os.path.join(f.localDir, f.namePattern)
            ):
                to_download.append(f)

        # is everything downloaded?
        if len(to_download) == 0:
            return True

        # nope, so pick the requested number and off we go
        for f in to_download:
            f.get()

        return False
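A hedged restatement of the per-file check above, where unit stands for an existing unit and f for one of its job's outputfiles (names are illustrative):

import os

local_path = os.path.join(f.localDir, f.namePattern)
if unit.copy_output.isValid(local_path) and not unit.copy_output.isDownloaded(local_path):
    f.get()  # fetch only what is still missing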
Example #10
    def n_all(self):
        total = 0
        for jid in self.active_job_ids:

            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("n_all Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache() and 'subjobs:status' in j.getNodeIndexCache():
                if len(j.getNodeIndexCache()['subjobs:status']) != 0:
                    total += len(j.getNodeIndexCache()['subjobs:status'])
                else:
                    total += 1
            else:
                #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
                if j.subjobs:
                    total = len(j.subjobs)
                else:
                    total = 1

        return total
Example #11
    def n_all(self):
        total = 0
        for jid in self.active_job_ids:

            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("n_all Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache(
            ) and 'subjobs:status' in j.getNodeIndexCache():
                if len(j.getNodeIndexCache()['subjobs:status']) != 0:
                    total += len(j.getNodeIndexCache()['subjobs:status'])
                else:
                    total += 1
            else:
                #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
                if j.subjobs:
                    total = len(j.subjobs)
                else:
                    total = 1

        return total
Example #12
    def createNewJob(self):
        """Create any jobs required for this unit"""
        j = GPI.Job()

        j.backend = self._getParent().backend.clone()

        # copy from ourselves or the parent transform depending on what's
        # specified
        fields = [
            'application', 'splitter', 'inputfiles', 'inputdata',
            'inputsandbox', 'outputfiles', 'postprocessors'
        ]

        for f in fields:

            if (f == "postprocessors"
                    and len(getattr(self, f).process_objects) > 0):
                j.postprocessors = copy.deepcopy(addProxy(self).postprocessors)
            elif (f != "postprocessors" and getattr(self, f)):
                setattr(j, f, copy.deepcopy(getattr(self, f)))
            elif (f == "postprocessors"
                  and len(getattr(self._getParent(), f).process_objects) > 0):
                j.postprocessors = copy.deepcopy(
                    addProxy(self._getParent()).postprocessors)
            elif (f != "postprocessors" and getattr(self._getParent(), f)):
                setattr(j, f, copy.deepcopy(getattr(self._getParent(), f)))

        return j
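The loop above gives unit-level settings precedence over the parent transform's for every field except postprocessors. A hedged illustration of that precedence, not part of the original code:

def pick(unit, field):
    # mirror the elif chain in createNewJob(): prefer the unit's own value,
    # otherwise fall back to the parent transform's
    return getattr(unit, field) or getattr(unit._getParent(), field)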
Example #13
   def checkOutputContainers(self):
      """Go through all completed units and make sure datasets are registered as required"""
      logger.info("Cleaning out transform %d container..." % self.getID())

      try:
         dslist = []
         dq2_lock.acquire()
         try:
            dslist = dq2.listDatasetsInContainer(self.getContainerName())
         except:
            dslist = []

         try:
            dq2.deleteDatasetsFromContainer(self.getContainerName(), dslist )

         except DQContainerDoesNotHaveDataset:
            pass
         except Exception as x:
            logger.error("Problem cleaning out Transform container: %s %s", x.__class__, x)
         except DQException as x:
            logger.error('DQ2 Problem cleaning out Transform container: %s %s' %( x.__class__, x))
      finally:
         dq2_lock.release()

      logger.info("Checking output data has been registered for Transform %d..." % self.getID())
      for unit in self.units:
         
         if len(unit.active_job_ids) == 0:
            continue

         if unit.status == "completed" and GPI.jobs(unit.active_job_ids[0]).outputdata and GPI.jobs(unit.active_job_ids[0]).outputdata._impl._name == "DQ2OutputDataset":
            logger.info("Checking containers in Unit %d..." % unit.getID() )
            unit.registerDataset()            
Example #14
    def test_update(self):
        from Ganga import GPI
        t = GPI.LHCbTask()
        tr = GPI.LHCbTransform(application=DaVinci(), backend=Dirac())
        t.appendTransform(tr)
        try:
            bkQueryList = [GPI.BKTestQuery(stripping20up)]
            tr.updateQuery()
            assert False, 'Should have thrown exception if updated with no query'
        except:
            tr.addQuery(GPI.BKTestQuery(stripping20down))

            # Check some new data added
            assert len(tr.inputdata), 'No data added after call to update'

            try:
                # Shouldn't allow a second update before processed the data in
                # toProcess_dataset
                tr.updateQuery()
                assert False, 'Should have thrown an error if updated with files already to process'
            except:
                # run so can update again with a removed dataset recall that jobs with the
                # old dataset only created when run called.
                t.run()
                assert len(tr.getJobs()), "No Jobs created upon run()"
                job = GPI.jobs(int(tr.getJobs()[0].fqid.split('.')[0]))
                sleep_until_state(job, 300, 'submitted')
                del tr._impl.query.dataset.files[0]
                tr.update(True)

                # Check the dead dataset is picked up
                assert len(
                    tr._impl.removed_data.files), "Didn\'t Pick up loss of a dataset"
                job.remove()
Example #15
 def test_OptionsFileSplitter_split(self):
     splitter = GPI.OptionsFileSplitter()
     splitter.optsArray = ['dummy1.opt', 'dummy2.opt', 'dummy3.opt']
     job = Job(application=DaVinci())
     job.prepare()
     #job.application.extra = GaudiExtras()
     subjobs = stripProxy(splitter).split(job)
     assert len(subjobs) == 3, 'incorrect number of subjobs'
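Outside the test, the same splitter would normally be attached to a job before submission; a hedged sketch with placeholder option files:

splitter = GPI.OptionsFileSplitter()
splitter.optsArray = ['selA.opt', 'selB.opt']  # placeholder option files
j.splitter = splitter                          # one subjob per entry when j is submitted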
Example #16
   def updateStatus(self, status):
      """Update status hook"""

      # register the dataset if applicable
      if status == "completed":
         job = GPI.jobs(self.active_job_ids[0])
         if job.outputdata and job.outputdata._impl._name == "DQ2OutputDataset" and not self.registerDataset():
            return
         
      super(AtlasUnit,self).updateStatus(status)
Example #17
    def test_update(self):
        from Ganga import GPI
        t = GPI.LHCbTask()
        tr1 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
        tr2 = GPI.LHCbTransform(application=DaVinci(), backend=Local())
        t.appendTransform(tr1)
        t.appendTransform(tr2)
        tr1.addQuery(GPI.BKTestQuery(stripping15up))
        tr2.addQuery(GPI.BKTestQuery(stripping15down))

        # Check that update produces some files to process over multiple
        # transforms
        t.update()
        assert len(
            t.transforms[0]._impl.toProcess_dataset.files
        ), 'Update did not produce any datafiles to process in transform 0'
        assert len(
            t.transforms[1]._impl.toProcess_dataset.files
        ), 'Update did not produce any datafiles to process in transform 1'
Example #18
 def removeUnusedJobs(self):
     """Remove all jobs that aren't being used, e.g. failed jobs"""
     for unit in self.units:
         for jid in unit.prev_job_ids:
             try:
                 logger.warning("Removing job '%d'..." % jid)
                 job = GPI.jobs(jid)
                 job.remove()
             except:
                 logger.error("Problem removing job '%d'" % jid)
Example #20
    def checkOutputContainers(self):
        """Go through all completed units and make sure datasets are registered as required"""
        logger.info("Cleaning out transform %d container..." % self.getID())

        try:
            dslist = []
            dq2_lock.acquire()
            try:
                dslist = dq2.listDatasetsInContainer(self.getContainerName())
            except:
                dslist = []

            try:
                dq2.deleteDatasetsFromContainer(self.getContainerName(),
                                                dslist)

            except DQContainerDoesNotHaveDataset:
                pass
            except Exception as x:
                logger.error("Problem cleaning out Transform container: %s %s",
                             x.__class__, x)
            except DQException as x:
                logger.error(
                    'DQ2 Problem cleaning out Transform container: %s %s' %
                    (x.__class__, x))
        finally:
            dq2_lock.release()

        logger.info(
            "Checking output data has been registered for Transform %d..." %
            self.getID())
        for unit in self.units:

            if len(unit.active_job_ids) == 0:
                continue

            if unit.status == "completed" and GPI.jobs(
                    unit.active_job_ids[0]).outputdata and GPI.jobs(
                        unit.active_job_ids[0]
                    ).outputdata._impl._name == "DQ2OutputDataset":
                logger.info("Checking containers in Unit %d..." % unit.getID())
                unit.registerDataset()
Example #21
 def _getPartitionMasterJob(self, partition):
     """Get the master job from any number of partition jobs."""
     partition_jobs = self.getPartitionJobs(partition)  # only call method once
     if not len(partition_jobs):
         raise GangaException(None, "Cant get partition master job when NO jobs assigned to partition")
     elif len(partition_jobs) == 1:
         return partition_jobs[0]
     # Need registry access here might be better to get registry directly
     # as in prepared stuff, see Executable for example or even
     # tasksregistry.py!
     return GPI.jobs(partition_jobs[0].fqid.split(".")[0])
Example #22
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            for inds in self.inputdata:
                from Ganga.GPI import TaskChainInput
                if isType(
                        inds, TaskChainInput
                ) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriate ones over as
        # input files
        flist = []
        import re
        for parent in parent_units:
            job = GPI.jobs(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(
                                        pat,
                                        f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

        return unit
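The chaining above is driven by TaskChainInput entries in the downstream transform's inputdata. A sketch of declaring one, with illustrative field values and no guarantee this is the only way to wire it:

from Ganga.GPI import TaskChainInput

chain = TaskChainInput()
chain.input_trf_id = 0                 # consume the output of transform 0
chain.include_file_mask = ['\\.dst$']  # keep only DST-like files (illustrative)
chain.exclude_file_mask = ['\\.log$']
trf.inputdata = [chain]                # trf: the downstream transform (assumed to exist)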
Example #23
    def getParentUnitJobs(self, parent_units, include_subjobs=True):
        """Return the list of parent jobs"""
        job_list = []
        for parent in parent_units:
            job = GPI.jobs(parent.active_job_ids[0])
            if job.subjobs:
                job_list += job.subjobs
            else:
                job_list += [job]

        return job_list
Example #24
 def removeUnusedJobs(self):
     """Remove all jobs that aren't being used, e.g. failed jobs"""
     for unit in self.units:
         for jid in unit.prev_job_ids:
             try:
                 logger.warning("Removing job '%d'..." % jid)
                 job = GPI.jobs(jid)
                 job.remove()
             except Exception as err:
                 logger.debug("removeUnused: %s" % str(err))
                 logger.error("Problem removing job '%d'" % jid)
Example #26
    def remove(self, remove_jobs="do_nothing"):
        """Delete the task"""

        # make sure the task isn't running
        if self.status.find("running") != -1:
            logger.error(
                "Task is still running. Please pause before removing!")
            return

        if not remove_jobs in [True, False]:
            logger.info("You want to remove the task %i named '%s'." %
                        (self.id, self.name))
            logger.info(
                "Since this operation cannot be easily undone, please call this command again:")
            logger.info(
                " * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs," % (self.id))
            logger.info(
                " * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs." % (self.id))
            return
        if remove_jobs:

            for trf in self.transforms:
                for unit in trf.units:
                    for jid in unit.active_job_ids:
                        try:
                            j = GPI.jobs(jid)
                            j.remove()
                        except Exception as err:
                            logger.debug("Remove Err: %s" % str(err))
                            pass

                    for jid in unit.prev_job_ids:
                        try:
                            j = GPI.jobs(jid)
                            j.remove()
                        except Exception as err2:
                            logger.debug("Remove Err2: %s" % str(err2))
                            pass

        self._getRegistry()._remove(self)
        logger.info("Task #%s deleted" % self.id)
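Session-level usage matching the messages printed above (task ids are illustrative):

tasks(3).remove(remove_jobs=True)   # remove task 3 together with its jobs
tasks(4).remove(remove_jobs=False)  # remove task 4 but keep the jobs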
Example #27
    def updateStatus(self, status):
        """Update status hook"""

        # check for input data deletion of chain data
        if status == "completed" and self._getParent().delete_chain_input and len(self.req_units) > 0:

            # the inputdata field *must* be filled from the parent task
            # NOTE: When changing to inputfiles, will probably need to check
            # for any specified in trf.inputfiles

            # check that the parent replicas have been copied by checking
            # backend status == Done
            job_list = []
            for req_unit in self.req_units:
                trf = self._getParent()._getParent().transforms[
                    int(req_unit.split(":")[0])]
                req_unit_id = req_unit.split(":")[1]

                if req_unit_id != "ALL":
                    unit = trf.units[int(req_unit_id)]
                    job_list.append(GPI.jobs(unit.active_job_ids[0]))
                else:
                    for unit in trf.units:
                        job_list.append(GPI.jobs(unit.active_job_ids[0]))

            for j in job_list:
                if j.subjobs:
                    for sj in j.subjobs:
                        if sj.backend.status != "Done":
                            return
                else:
                    if j.backend.status != "Done":
                        return

            job = GPI.jobs(self.active_job_ids[0])
            for f in job.inputdata.files:
                logger.warning(
                    "Removing chain inputdata file '%s'..." % f.name)
                f.remove()

        super(LHCbUnit, self).updateStatus(status)
Example #28
    def checkForResubmission(self):
        """check if this unit should be resubmitted"""

        # check if we already have a job
        if len(self.active_job_ids) == 0:
            return False
        else:
            job = GPI.jobs(self.active_job_ids[0])
            if job.status in ["failed", "killed"]:
                return True

            return False
Example #29
    def test_addQuery(self):
        from Ganga import GPI
        tr = GPI.LHCbTransform(application=DaVinci(), backend=Local())
        t = GPI.LHCbTask()

        # Check non-lists and adding query to transform and non-associated
        t.addQuery(tr, GPI.BKTestQuery(stripping15up))
        assert len(t.transforms), 'Transform not associated correctly'
        assert t.transforms[0].queries[
            0].path == stripping15up, 'Query path not correctly assigned'

        # Check duplicating
        t.addQuery(tr, bkQueryList)
        assert len(
            t.transforms) == 4, 'Problem duplicating and appending transforms'
        tmpList = [
            stripping15up, stripping15down, stripping16up, stripping16down
        ]
        for tran in t.transforms:
            assert tran.queries[
                0].path in tmpList, 'Query attribute not setup properly for all transforms'
Example #31
 def test_GaussSplitter_split(self):
     job = Job(application=Gauss())
     job.application.platform = 'x86_64-slc6-gcc48-opt'
     f = open('this-is-not-a-file.opts', 'w')
     f.write('')
     f.close()
     job.application.optsfile = 'this-is-not-a-file.opts'  # hack for Gauss
     stripProxy(job.application).master_configure()
     job.prepare()
     gsplit = GPI.GaussSplitter(eventsPerJob=1, numberOfJobs=3)
     subjobs = stripProxy(gsplit).split(job)
     assert len(subjobs) == 3, 'incorrect # of jobs'
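A hedged sketch of attaching the same splitter to a job in a session instead of driving it through stripProxy as the test does (j is assumed to be a prepared Gauss job):

gsplit = GPI.GaussSplitter(eventsPerJob=1, numberOfJobs=3)
j.splitter = gsplit  # submitting j would then create 3 subjobs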
Example #32
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            for inds in self.inputdata:
                from Ganga.GPI import TaskChainInput
                if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriate ones over as
        # input files
        flist = []
        import re
        for parent in parent_units:
            job = GPI.jobs(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(pat, f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

        return unit
Example #33
   def createNewJob(self):
      """Create any jobs required for this unit"""      
      j = GPI.Job()
      j._impl.backend = self._getParent().backend.clone()
      j._impl.application = self._getParent().application.clone()
      j.inputdata = self.inputdata.clone()

      trf = self._getParent()
      task = trf._getParent()

      # copy across the outputfiles
      for f in trf.outputfiles:
         j.outputfiles += [f.clone()]

      j.inputsandbox = trf.inputsandbox

      if type(self.eventswanted) == type(''):
        subLines = self.eventswanted
      else:
        subLines = '\n'.join(self.eventswanted)
      # Base for the naming of each subjob's CSV file
      incsvfile = j._impl.application.csvfile
      tmpname = os.path.basename(incsvfile)
      if len(tmpname.split('.')) > 1:
        patterncsv = '.'.join(tmpname.split('.')[0:-1])+"_sub%d."+ tmpname.split('.')[-1]
      else:
        patterncsv = tmpname+"_sub%d"

      from Ganga.GPIDev.Lib.File import FileBuffer
      thiscsv = patterncsv % self.subpartid

      # Create the CSV file for this Unit
      j._impl.getInputWorkspace().writefile(FileBuffer(thiscsv,subLines),executable=0)
      j._impl.application.csvfile = j._impl.getInputWorkspace().getPath()+thiscsv
      j.inputsandbox.append(j._impl.getInputWorkspace().getPath()+thiscsv)

      # Base for the naming of each subjob's output file
      tmpname = os.path.basename(j._impl.application.outputfile)
      if len(tmpname.split('.')) > 1:
        patternout = '.'.join(tmpname.split('.')[0:-1])+"_sub%d."+ tmpname.split('.')[-1]
      else:
        patternout = tmpname+"_sub%d"
      j._impl.application.outputfile = patternout % self.subpartid

      # Sort out the splitter
      if trf.splitter:
         j.splitter = trf.splitter.clone()
         
      return j
Example #34
 def _getPartitionMasterJob(self, partition):
     """Get the master job from any number of partition jobs."""
     partition_jobs = self.getPartitionJobs(
         partition)  # only call method once
     if not len(partition_jobs):
         raise GangaException(
             None,
             'Cant get partition master job when NO jobs assigned to partition'
         )
     elif len(partition_jobs) == 1:
         return partition_jobs[0]
     # Need registry access here might be better to get registry directly
     # as in prepared stuff, see Executable for example or even
     # tasksregistry.py!
     return GPI.jobs(partition_jobs[0].fqid.split('.')[0])
Example #35
 def createNewJob(self, partition):
     """ Returns a new job initialized with the transforms application, backend and name """
     task = self._getParent(
     )  # this works because createNewJob is only called by a task
     id = task.transforms.index(self)
     j = GPI.Job()
     stripProxy(j).backend = self.backend.clone()
     stripProxy(j).application = self.application.clone()
     stripProxy(j).application.tasks_id = "%i:%i" % (task.id, id)
     stripProxy(j).application.id = self.getNewAppID(partition)
     j.inputdata = self.inputdata
     j.outputdata = self.outputdata
     j.inputsandbox = self.inputsandbox
     j.outputsandbox = self.outputsandbox
     j.name = "T%i:%i C%i" % (task.id, id, partition)
     return j
Example #36
    def n_active(self):

        if self.status == 'completed':
            return 0

        tot_active = 0
        active_states = ['submitted', 'running']

        for jid in self.active_job_ids:

            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("n_active Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache(
            ) and 'subjobs:status' in j.getNodeIndexCache():
                if len(j.getNodeIndexCache()['subjobs:status']) > 0:
                    for sj_stat in j.getNodeIndexCache()['subjobs:status']:
                        if sj_stat in active_states:
                            tot_active += 1
                else:
                    if j.getNodeIndexCache()['status'] in active_states:
                        tot_active += 1
            else:
                #logger.warning("WARNING: (active check) No index cache for job object %d" % jid)
                if j.status in active_states:
                    if j.subjobs:
                        for sj in j.subjobs:
                            if sj.status in active_states:
                                tot_active += 1
                    else:
                        tot_active += 1

        return tot_active
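The guarded getNodeIndexCache() access above is a lazy-loading optimisation: use the registry index cache when it already holds subjob statuses and only fall back to loading the full job otherwise. A hedged restatement of that guard:

cache = j.getNodeIndexCache() if hasattr(j, 'getNodeIndexCache') else None
if cache and 'subjobs:status' in cache:
    statuses = cache['subjobs:status'] or [cache['status']]
else:
    statuses = [sj.status for sj in j.subjobs] or [j.status]  # forces a full load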
Example #37
    def n_active(self):

        if self.status == 'completed':
            return 0

        tot_active = 0
        active_states = ['submitted', 'running']

        for jid in self.active_job_ids:

            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("n_active Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, 'getNodeIndexCache') and j.getNodeIndexCache() and 'subjobs:status' in j.getNodeIndexCache():
                if len(j.getNodeIndexCache()['subjobs:status']) > 0:
                    for sj_stat in j.getNodeIndexCache()['subjobs:status']:
                        if sj_stat in active_states:
                            tot_active += 1
                else:
                    if j.getNodeIndexCache()['status'] in active_states:
                        tot_active += 1
            else:
                #logger.warning("WARNING: (active check) No index cache for job object %d" % jid)
                if j.status in active_states:
                    if j.subjobs:
                        for sj in j.subjobs:
                            if sj.status in active_states:
                                tot_active += 1
                    else:
                        tot_active += 1

        return tot_active
Example #38
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create a chained unit using the output data from the given units"""

        # check all parent units for copy_output
        copy_output_ok = True
        for parent in parent_units:
            if not parent.copy_output:
                copy_output_ok = False

        # all parent units must be completed so the outputfiles are filled correctly
        for parent in parent_units:
            if parent.status != "completed":
                return None

        if not use_copy_output or not copy_output_ok:
            unit = ND280Unit_CSVEvtList()
            unit.inputdata = ND280LocalDataset()
            for parent in parent_units:
                # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
                job = GPI.jobs(parent.active_job_ids[0])
                for f in job.outputfiles:
                    # should check for different file types and add them as appropriate to the dataset
                    # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
                    # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
                    unit.inputdata.names.append(
                        os.path.join(job.outputdir, f.namePattern))
        else:

            unit = ND280Unit_CSVEvtList()
            unit.inputdata = ND280LocalDataset()

            for parent in parent_units:
                # unit needs to have completed and downloaded before we can get file list
                if parent.status != "completed":
                    return None

                # we should be OK so copy all output to the dataset
                for f in parent.copy_output.files:
                    unit.inputdata.names.append(
                        os.path.join(parent.copy_output.local_location, f))

        return unit
Example #39
 def createNewJob(self, partition):
     """ Returns a new job initialized with the transforms application, backend and name """
     j = GPI.Job()
     stripProxy(j).backend = self.backend.clone()
     stripProxy(j).application = self.application.clone()
     stripProxy(j).application.tasks_id = "%i:%i" % (self.task_id,
                                                     self.transform_id)
     stripProxy(j).application.id = self.getNewAppID(partition)
     if self.splitter is not None:
         stripProxy(j).splitter = LHCbTaskDummySplitter(self.splitter)
     # if self.merger is not None:
     # stripProxy(j).merger = self.merger
     j.inputdata = self.toProcess_dataset
     j.outputdata = self.outputdata
     j.inputsandbox = self.inputsandbox
     j.outputsandbox = self.outputsandbox
     j.name = "T%i Tr%i P%i" % (self.task_id, self.transform_id, partition)
     j.do_auto_resubmit = True
     self.toProcess_dataset.files = []
     return j
Example #40
    def removeUnusedData(self):
        """Remove any output data from orphaned jobs"""
        for unit in self.units:
            for jid in unit.prev_job_ids:
                try:
                    logger.warning("Removing data from job '%d'..." % jid)
                    job = GPI.jobs(jid)

                    jlist = []
                    if len(job.subjobs) > 0:
                        jlist = job.subjobs
                    else:
                        jlist = [job]

                    for sj in jlist:
                        for f in sj.outputfiles:
                            if isType(f, DiracFile) and f.lfn:
                                f.remove()
                except:
                    logger.error("Problem deleting data for job '%d'" % jid)
                    pass
Example #42
   def createChainUnit( self, parent_units, use_copy_output = True ):
      """Create a chained unit using the output data from the given units"""

      # check all parent units for copy_output
      copy_output_ok = True
      for parent in parent_units:
         if not parent.copy_output:
            copy_output_ok = False

      # all parent units must be completed so the outputfiles are filled correctly
      for parent in parent_units:
         if parent.status != "completed":
               return None

      if not use_copy_output or not copy_output_ok:
         unit = ND280Unit_CSVEvtList()
         unit.inputdata = ND280LocalDataset()
         for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = GPI.jobs(parent.active_job_ids[0])
            for f in job.outputfiles:
               # should check for different file types and add them as appropriate to the dataset
               # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
               # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
               unit.inputdata.names.append( os.path.join( job.outputdir, f.namePattern ) )
      else:

         unit = ND280Unit_CSVEvtList()
         unit.inputdata = ND280LocalDataset()

         for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
               return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
               unit.inputdata.names.append( os.path.join( parent.copy_output.local_location, f ) )
         
      return unit
Example #43
    def n_status(self, status):
        tot_active = 0
        for jid in self.active_job_ids:

            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("n_status Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
                if len(j._index_cache['subjobs:status']) > 0:
                    for sj_stat in j._index_cache['subjobs:status']:
                        if sj_stat == status:
                            tot_active += 1
                else:
                    if j._index_cache['status'] == status:
                        tot_active += 1

            else:
                #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
                if j.subjobs:
                    for sj in j.subjobs:
                        if sj.status == status:
                            tot_active += 1
                else:
                    if j.status == status:
                        tot_active += 1

        return tot_active
Example #44
   def getContainerList(self):
      """Return a list of the output containers associated with this unit"""
      job = GPI.jobs(self.active_job_ids[0])
      cont_list = []
      if job.backend._impl._name == "Jedi":
         # Jedi jobs have their datasets stored in datasetList
         for ds in job.outputdata.datasetList:
            cont_list.append(ds)

      elif job.backend.individualOutDS:
         # find all the individual out ds's
         for ds in job.subjobs(0).outputdata.output:

            # find all containers listed
            for cont_name in ds.split(","):
               if not cont_name.endswith("/"):
                  continue

               if not cont_name in cont_list:
                  cont_list.append(cont_name)
      else:
         cont_list.append(job.outputdata.datasetname)

      return cont_list
Example #46
   def createChainUnit( self, parent_units, use_copy_output = True ):
      """Create an output unit given this output data"""
      
      # we need valid parent jobs
      for parent in parent_units:
         # need datasetname filled for Panda jobs
         if len(parent.active_job_ids) == 0 or \
                (GPI.jobs(parent.active_job_ids[0]).application._impl._name != "TagPrepare" and \
                 GPI.jobs(parent.active_job_ids[0]).outputdata and \
                 GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Panda" and \
                 GPI.jobs(parent.active_job_ids[0]).outputdata.datasetname == ""):
            return None

         # need datasetList filled for Jedi jobs
         if len(parent.active_job_ids) == 0 or \
                (GPI.jobs(parent.active_job_ids[0]).application._impl._name != "TagPrepare" and \
                 GPI.jobs(parent.active_job_ids[0]).outputdata and \
                 GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Jedi" and \
                 len(GPI.jobs(parent.active_job_ids[0]).outputdata.datasetList) == 0):
            return None

         # for local jobs, make sure units are complete
         if GPI.jobs(parent_units[0].active_job_ids[0]).outputdata._impl._name == "ATLASOutputDataset" and \
                parent.status != "completed":
            return None
                

      # Are we doing Local -> Local? i.e. are we going from ATLASOutputDataset?
      # Problem: Doesn't take into account merger locations...
      if GPI.jobs(parent_units[0].active_job_ids[0]).outputdata._impl._name == "ATLASOutputDataset":
         unit = AtlasUnit()
         unit.inputdata = ATLASLocalDataset()

         for parent in parent_units:
            for l in GPI.jobs(parent.active_job_ids[0]).outputdata.output:
               unit.inputdata.names += l
            
      # should we use the copy_output (ie. local output). Special case for TagPrepare
      elif GPI.jobs(parent_units[0].active_job_ids[0]).application._impl._name == "TagPrepare":
         
         # make sure all have completed before taking the tag-info
         if parent_units[0].status != "completed":
            return None
         
         unit = AtlasUnit()
         unit.inputdata = DQ2Dataset()
         unit.inputdata.tag_info = GPI.jobs(parent_units[0].active_job_ids[0]).application.tag_info
         
      elif not use_copy_output or not parent.copy_output:
         unit = AtlasUnit()
         unit.inputdata = DQ2Dataset()
         ds_list = []
         for parent in parent_units:
            
            # Don't just use the main datasetname as Jedi introduces separate containers for logs and output files
            if GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Jedi":
               for ds in GPI.jobs(parent.active_job_ids[0]).outputdata.datasetList:
                  if not ds.endswith(".log/"):
                     unit.inputdata.dataset.append( ds )
            else:
               unit.inputdata.dataset.append( GPI.jobs(parent.active_job_ids[0]).outputdata.datasetname )
         
      else:

         unit = AtlasUnit()
         unit.inputdata = ATLASLocalDataset()

         for parent in parent_units:
            # unit needs to have completed and downloaded
            if parent.status != "completed":
               return None

            # we should be OK so copy all output to an ATLASLocalDataset
            for f in parent.copy_output.files:
               unit.inputdata.names.append( os.path.join( parent.copy_output.local_location, f ) )
         
      return unit
Example #47
   def copyOutput(self):
      """Copy the output data to local storage"""

      job = GPI.jobs(self.active_job_ids[0])
      
      if self.copy_output._name != "TaskLocalCopy" or job.outputdata._impl._name != "DQ2OutputDataset":
         logger.error("Cannot transfer from DS type '%s' to '%s'. Please contact plugin developer." % (job.outputdata._name, self.copy_output._name))
         return False

      # get list of output files
      self._acquireDownloadLock()
      dq2_list = []
      if len(self.output_file_list) == 0:
         for ds in self.getOutputDatasetList():
            dq2_list = dq2.listFilesInDataset(ds)
            
            # merge job DSs leave empty non-merged DSs around
            if job.backend.__class__.__name__ == "Panda" and job.backend.requirements.enableMerge and not ds.endswith("merge") and len(dq2_list) == 0:
               continue

            for guid in dq2_list[0].keys():
               self.output_file_list[ dq2_list[0][guid]['lfn'] ] = ds
         
      # check which ones still need downloading
      to_download = {}
      for f in self.output_file_list.keys():
         
         # check for REs
         if self.copy_output.isValid(f) and not self.copy_output.isDownloaded(f):            
            to_download[ f ] = self.output_file_list[f]

      # store download location in case it's changed while downloading
      download_loc = self.copy_output.local_location
      self._releaseDownloadLock()

      # is everything downloaded?
      if len(to_download.keys()) == 0:
         return True

      # nope, so pick the requested number and off we go
      thread_array = []
      for fname in to_download.keys()[:self._getParent().num_dq2_threads]:
         dsname = to_download[fname]
         exe = 'dq2-get -L ROAMING -a -d -H %s -f %s %s' % (download_loc, fname, dsname)
         logger.info("Downloading '%s' to %s..." % (fname, download_loc))

         thread = Download.download_dq2(exe)
         thread.start()
         thread_array.append(thread)

      for t in thread_array:
         t.join()

      self._acquireDownloadLock()
      
      # check for valid download - SHOULD REALLY BE A HASH CHECK
      for fname in to_download.keys()[:self._getParent().num_dq2_threads]:
         full_path = os.path.join(self.copy_output.local_location, fname)
         if not os.path.exists(full_path):
            logger.error("Error downloading '%s'. File doesn't exist after download." % full_path)
         elif os.path.getsize( full_path ) < 4:
            logger.error("Error downloading '%s'. File size smaller than 4 bytes (%d)" % (full_path, os.path.getsize( full_path ) ))
         else:
            self.copy_output.files.append(fname)
            logger.info("File '%s' downloaded successfully" % full_path)

      self._releaseDownloadLock()

      return False
Example #48
   def registerDataset(self):
      """Register in the transform container"""
      trf = self._getParent()
      trf_container = trf.getContainerName()

      fail = False
      try:
         containerinfo = {}
         dq2_lock.acquire()
         try:
            containerinfo = dq2.listDatasets(trf_container)
         except:
            containerinfo = {}
            
         if containerinfo == {}:
            try:
               dq2.registerContainer(trf_container)
               logger.info('Registered container for Unit %i of Transform %i: %s' % (self.getID(), trf.getID(), trf_container))
               
            except Exception as x:
               logger.error('Problem registering container for Unit %i of Transform %i, %s : %s %s' % (self.getID(), trf.getID(), trf_container,x.__class__, x))
               fail = True
            except DQException as x:
               logger.error('DQ2 Problem registering container for Unit %i of Transform %i, %s : %s %s' % (self.getID(), trf.getID(), trf_container,x.__class__, x))
               fail = True
               
         job = GPI.jobs(self.active_job_ids[0])
         ds_list = self.getOutputDatasetList()

         for ds in ds_list:
            try:
               dq2.registerDatasetsInContainer(trf_container, [ ds ] )
            except DQContainerAlreadyHasDataset:
               pass
            except Exception as x:
               logger.error('Problem registering dataset %s in container %s: %s %s' %( job.outputdata.datasetname, trf_container, x.__class__, x))
               fail = True
            except DQException as x:
               logger.error('DQ2 Problem registering dataset %s in container %s: %s %s' %( job.outputdata.datasetname, trf_container, x.__class__, x))
               fail = True
      finally:
         dq2_lock.release()
         
      if fail:
         return not fail
      
      # add dataset to the task container
      task = trf._getParent()
      task_container = task.getContainerName()
      
      try:
         containerinfo = {}
         dq2_lock.acquire()
         try:
            containerinfo = dq2.listDatasets(task_container)
         except:
            containerinfo = {}
         if containerinfo == {}:
            try:
               dq2.registerContainer(task_container)
               logger.info('Registered container for Unit %i of Transform %i: %s' % (self.getID(), trf.getID(), task_container))
                  
            except Exception as x:
               logger.error('Problem registering container for Unit %i of Transform %i in Task %i, %s : %s %s' %
                            (self.getID(), trf.getID(), task.getID(), task_container, x.__class__, x))
               fail = True
            except DQException as x:
               logger.error('DQ2 Problem registering container for Unit %i of Transform %i in Task %i, %s : %s %s' %
                            (self.getID(), trf.getID(), task.getID(), task_container, x.__class__, x))
               fail = True 

         ds_list = self.getOutputDatasetList()

         for ds in ds_list:
            try:
               dq2.registerDatasetsInContainer(task_container, [ ds ] )
            except DQContainerAlreadyHasDataset:
               pass
            except Exception as x:
               logger.error('Problem registering dataset %s in container %s: %s %s' %( job.outputdata.datasetname, task_container, x.__class__, x))
               fail = True
            except DQException as x:
               logger.error('DQ2 Problem registering dataset %s in container %s: %s %s' %( job.outputdata.datasetname, task_container, x.__class__, x))
               fail = True
      finally:
          dq2_lock.release()

      return not fail
Example #49
    def update(self):
        """Update the unit and (re)submit jobs as required"""
        #logger.warning("Entered Unit %d update function..." % self.getID())

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString(self, "Creating Job...")
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString(self,
                                  "Attempting job submission with queues...")
                    GPI.queues.add(j.submit)
                else:
                    addInfoString(self, "Attempting job submission...")
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString(self, "Failed Job Submission")
                addInfoString(self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error(
                        "Too many resubmits (%i). Deactivating unit." %
                        (self.minor_resub_count + self.major_resub_count))
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit."
                            % self.minor_resub_count)
                        addInfoString(
                            self,
                            "Deactivating unit. Too many minor resubmits (%i)" %
                            self.minor_resub_count)
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." %
                        self.major_resub_count)
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString(self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString(self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
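
The failure branch in the update() above applies three limits before resubmitting: an overall run_limit, a minor_run_limit (with an optional rebroker escape hatch) and a major_run_limit. A self-contained sketch of just that decision follows; the function name and the string return values are hypothetical and only mirror the attributes used in the example.

    # A minimal sketch (not Ganga source) of the resubmit bookkeeping in the failure branch above.
    def choose_resubmit(minor_count, major_count, minor_limit, major_limit, run_limit,
                        rebroker_on_fail, major_resubmit_needed):
        # combined limit across minor and major resubmits
        if minor_count + major_count > run_limit - 1:
            return 'deactivate'
        rebroker = False
        if minor_count > minor_limit - 1:
            if rebroker_on_fail:
                # escalate to a major (rebrokered) resubmit instead of giving up
                rebroker = True
            else:
                return 'deactivate'
        if major_count > major_limit - 1:
            return 'deactivate'
        return 'major' if (rebroker or major_resubmit_needed) else 'minor'

    # e.g. choose_resubmit(2, 0, 2, 2, 6, True, False) -> 'major' (rebrokered)
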
Exemple #50
0
   def createChainUnit( self, parent_units, use_copy_output = True ):
      """Create a chained unit using the output data from the given units"""

      # check all parent units for copy_output
      copy_output_ok = True
      for parent in parent_units:
         if not parent.copy_output:
            copy_output_ok = False

      # all parent units must be completed so the outputfiles are filled correctly
      for parent in parent_units:
         if parent.status != "completed":
            return None

      if len(parent_units) == 0:
         return None

      if not use_copy_output or not copy_output_ok:
         unit = ND280Unit()
         unit.inputdata = ND280LocalDataset()
         for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = GPI.jobs(parent.active_job_ids[0])

            # if TaskChainInput.include_file_mask is not used, fall back to the old behaviour (see below);
            # otherwise add all files matching the include_file_mask(s) to unit.inputdata. DV.
            inc_file_mask = False
            for p in self.inputdata[0].include_file_mask:
               unit.inputdata.get_dataset(job.outputdir, p)
               inc_file_mask = True

            if not inc_file_mask:
               for f in job.outputfiles:
                  # should check for different file types and add them as appropriate to the dataset
                  # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
                  # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
                  # TODO: implement use of include/exclude_file_mask
                  #       
                  try:
                     outputfilenameformat = f.outputfilenameformat
                  except:
                     # file type has no outputfilenameformat attribute, so fall back to the job output directory
                     inputdir = job.outputdir
                  else:
                     #### WARNING: The following will work only if the MassStorageFile puts the files in local directories !
                     inputdir = '/'.join( [getConfig('Output')['MassStorageFile']['uploadOptions']['path'], f.outputfilenameformat.replace('{fname}','')])
                  unit.inputdata.get_dataset( inputdir, f.namePattern )
      else:

         unit = ND280Unit()
         unit.inputdata = ND280LocalDataset()

         for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
               return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
               unit.inputdata.names.append( os.path.join( parent.copy_output.local_location, f ) )
         
      return unit
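
The inner loop above decides, per output file, whether to read from the job's local output directory or from the mass-storage upload path built from outputfilenameformat. A condensed, self-contained sketch of that choice is shown below; the helper name resolve_input_dir is hypothetical, and the mass-storage path is passed in rather than read via getConfig('Output').

   # A minimal sketch (not Ganga source) of the per-file input directory choice made above.
   def resolve_input_dir(outputfile, job_outputdir, mass_storage_path):
      fmt = getattr(outputfile, 'outputfilenameformat', None)
      if fmt is None:
         # plain local output: read straight from the job output directory
         return job_outputdir
      # mass-storage style output: drop the {fname} placeholder and join onto the upload path
      return '/'.join([mass_storage_path, fmt.replace('{fname}', '')])
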
Exemple #51
0
    def updateQuery(self, resubmit=False):
        """Update the dataset information of the transforms. This will
        include any new data in the processing or re-run jobs that have data which
        has been removed."""
        if len(self.queries) == 0:
            raise GangaException(
                None, 'Cannot call updateQuery() on an LHCbTransform without any queries')

        if self._getParent() != None:
            logger.info('Retrieving latest bookkeeping information for transform %i:%i, please wait...' % (
                self._getParent().id, self.getID()))
        else:
            logger.info(
                'Retrieving latest bookkeeping information for transform, please wait...')

        # check we have an input DS per BK Query
        while len(self.queries) > len(self.inputdata):
            self.inputdata.append(LHCbDataset())

        # loop over the queries and fill the file lists
        for id, query in enumerate(self.queries):

            # Get the latest dataset
            latest_dataset = query.getDataset()

            # Compare to previous inputdata, get new and removed
            logger.info(
                'Checking for new and removed data for query %d, please wait...' % self.queries.index(query))
            dead_data = LHCbDataset()
            new_data = LHCbDataset()

            # loop over the old data and compare
            new_data.files += latest_dataset.difference(
                self.inputdata[id]).files
            dead_data.files += self.inputdata[
                id].difference(latest_dataset).files

            # for dead data, find and kill/remove any associated jobs
            # loop over units and check any associated with this DS
            # TODO: Follow through chained tasks
            for unit in self.units:
                # skip units not associated with this query's dataset
                if unit.input_datset_index != id:
                    continue

                # find the job
                if len(unit.active_job_ids) == 0:
                    continue

                # check the data
                for f in dead_data.files:
                    if f in unit.inputdata.files:

                        # kill the job
                        job = GPI.jobs(unit.active_job_ids[0])
                        if job.status in ['submitted', 'running']:
                            job.kill()

                        # forget the job
                        unit.prev_job_ids.append(unit.active_job_ids[0])
                        unit.active_job_ids = []
                        break

            # in any case, now just set the DS files to the new set
            self.inputdata[id].files = []
            self.inputdata[id].files = latest_dataset.files
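
The core of updateQuery() is a symmetric set difference between the freshly fetched dataset and the one stored for that query: files only in the new dataset are added, files only in the old one mark jobs to be killed and forgotten. A minimal sketch of that comparison, using plain lists of file names in place of LHCbDataset objects (the helper name is hypothetical), is:

    # A minimal sketch (not Ganga source) of the new/removed comparison performed per query above.
    def diff_datasets(old_files, latest_files):
        old, latest = set(old_files), set(latest_files)
        new_data = sorted(latest - old)    # files to include in future processing
        dead_data = sorted(old - latest)   # files whose associated jobs should be killed/forgotten
        return new_data, dead_data

    # e.g. diff_datasets(['a.dst', 'b.dst'], ['b.dst', 'c.dst']) -> (['c.dst'], ['a.dst'])
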
Exemple #52
0
    def update(self):
        """Update the unit and (re)submit jobs as required"""
        #logger.warning("Entered Unit %d update function..." % self.getID())

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString( self, "Creating Job..." )
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString( self, "Attempting job submission with queues..." )
                    GPI.queues.add(j.submit)
                else:
                    addInfoString( self, "Attempting job submission..." )
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString( self, "Failed Job Submission")
                addInfoString( self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error("Too many resubmits (%i). Deactivating unit." % (
                        self.minor_resub_count + self.major_resub_count))
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % ( self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count)
                        addInfoString( self, "Deactivating unit. Too many minor resubmits (%i)" % self.minor_resub_count)
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count)
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString( self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString( self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
Exemple #53
0
 def initialize(self):
     from Ganga import GPI
     self.backend = stripProxy(GPI.Local())
Exemple #54
0
   def unregisterDataset(self):
      """Register in the transform container"""
      trf = self._getParent()
      trf_container = trf.getContainerName()
      fail = False
      try:
         containerinfo = {}
         dq2_lock.acquire()
         try:
            containerinfo = dq2.listDatasets(trf_container)
         except:
            containerinfo = {}
            
         if containerinfo != {}:
            job = GPI.jobs(self.active_job_ids[0])
            ds_list = self.getOutputDatasetList()
            for ds in ds_list:
               
               try:
                  dq2.deleteDatasetsFromContainer(trf_container, [ ds ] )
               except DQContainerDoesNotHaveDataset:
                  pass
               except DQException as x:
                  logger.error('DQ2 Problem removing dataset %s from container %s: %s %s' %( job.outputdata.datasetname, trf_container, x.__class__, x))
                  fail = True
               except Exception as x:
                  logger.error('Problem removing dataset %s from container %s: %s %s' %( job.outputdata.datasetname, trf_container, x.__class__, x))
                  fail = True
      finally:
         dq2_lock.release()

      if fail:
         return not fail
      
      # now remove the datasets from the task container
      task = trf._getParent()
      task_container = task.getContainerName()

      try:
         containerinfo = {}
         dq2_lock.acquire()
         try:
            containerinfo = dq2.listDatasets(task_container)
         except:
            containerinfo = {}
            
         if containerinfo != {}:
            job = GPI.jobs(self.active_job_ids[0])
            ds_list = self.getOutputDatasetList()
            for ds in ds_list:
               
               try:
                  dq2.deleteDatasetsFromContainer(task_container, [ ds ] )
               except DQContainerDoesNotHaveDataset:
                  pass
               except DQException as x:
                  logger.error('DQ2 Problem removing dataset %s from container %s: %s %s' %( job.outputdata.datasetname, task_container, x.__class__, x))
                  fail = True
               except Exception as x:
                  logger.error('Problem removing dataset %s from container %s: %s %s' %( job.outputdata.datasetname, task_container, x.__class__, x))
                  fail = True
      finally:
         dq2_lock.release()

      return not fail
Exemple #55
0
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need valid parent jobs
        for parent in parent_units:
            # need datasetname filled for Panda jobs
            if len(parent.active_job_ids) == 0 or \
                   (GPI.jobs(parent.active_job_ids[0]).application._impl._name != "TagPrepare" and \
                    GPI.jobs(parent.active_job_ids[0]).outputdata and \
                    GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Panda" and \
                    GPI.jobs(parent.active_job_ids[0]).outputdata.datasetname == ""):
                return None

            # need datasetList filled for Jedi jobs
            if len(parent.active_job_ids) == 0 or \
                   (GPI.jobs(parent.active_job_ids[0]).application._impl._name != "TagPrepare" and \
                    GPI.jobs(parent.active_job_ids[0]).outputdata and \
                    GPI.jobs(parent.active_job_ids[0]).backend._impl._name == "Jedi" and \
                    len(GPI.jobs(parent.active_job_ids[0]).outputdata.datasetList) == 0):
                return None

            # for local jobs, make sure units are complete
            if GPI.jobs(parent_units[0].active_job_ids[0]).outputdata._impl._name == "ATLASOutputDataset" and \
                   parent.status != "completed":
                return None

        # Are we doing Local -> Local? i.e. are we going from ATLASOutputDataset?
        # Problem: Doesn't take into account merger locations...
        if GPI.jobs(parent_units[0].active_job_ids[0]
                    ).outputdata._impl._name == "ATLASOutputDataset":
            unit = AtlasUnit()
            unit.inputdata = ATLASLocalDataset()

            for parent in parent_units:
                for l in GPI.jobs(parent.active_job_ids[0]).outputdata.output:
                    unit.inputdata.names += l

        # special case for TagPrepare: take the tag info straight from the parent job's application
        elif GPI.jobs(parent_units[0].active_job_ids[0]
                      ).application._impl._name == "TagPrepare":

            # make sure all have completed before taking the tag-info
            if parent_units[0].status != "completed":
                return None

            unit = AtlasUnit()
            unit.inputdata = DQ2Dataset()
            unit.inputdata.tag_info = GPI.jobs(
                parent_units[0].active_job_ids[0]).application.tag_info

        # should we use the copy_output (i.e. local output)? If not, chain the DQ2 datasets directly
        elif not use_copy_output or not parent.copy_output:
            unit = AtlasUnit()
            unit.inputdata = DQ2Dataset()
            ds_list = []
            for parent in parent_units:

                # Don't just use the main datasetname as Jedi introduces separate containers for logs and output files
                if GPI.jobs(parent.active_job_ids[0]
                            ).backend._impl._name == "Jedi":
                    for ds in GPI.jobs(
                            parent.active_job_ids[0]).outputdata.datasetList:
                        if not ds.endswith(".log/"):
                            unit.inputdata.dataset.append(ds)
                else:
                    unit.inputdata.dataset.append(
                        GPI.jobs(
                            parent.active_job_ids[0]).outputdata.datasetname)

        else:

            unit = AtlasUnit()
            unit.inputdata = ATLASLocalDataset()

            for parent in parent_units:
                # unit needs to have completed and downloaded
                if parent.status != "completed":
                    return None

                # we should be OK so copy all output to an ATLASLocalDataset
                for f in parent.copy_output.files:
                    unit.inputdata.names.append(
                        os.path.join(parent.copy_output.local_location, f))

        return unit
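
In the Jedi branch above, only the non-log output containers of each parent job are chained into the new unit's DQ2Dataset, since Jedi registers a separate ".log/" container alongside the output. A standalone sketch of that filter (the helper name is hypothetical; plain strings stand in for the Ganga dataset list) is:

    # A minimal sketch (not Ganga source) of the Jedi output-dataset selection in the branch above.
    def select_jedi_output_datasets(dataset_list):
        # skip the log container that Jedi registers alongside the real output
        return [ds for ds in dataset_list if not ds.endswith(".log/")]

    # e.g. select_jedi_output_datasets(["user.x.out_DAOD/", "user.x.out.log/"])
    #      -> ["user.x.out_DAOD/"]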