Example #1
    def updateStatus(self, status):
        "if we've just been switched to running, the job has been submitted so update the DB"
        if status == "running" and self.status == "new" and len(self.active_job_ids) != 0:
            ins_fields = "run,description,decay_type,radcor,mc_version,seed,events,output_name,jdl,mac,exe,stderr,stdout,status_url,submitter,submitted_on,status"
            #(submitted_on format is "2013-MM-DD HH:mm:ss")
            app = getJobByID(self.active_job_ids[0]).application
            ins_vals = "%d, '%s production job', %d, %d, %d, %d, %d, '%s_v%d_r%d.root', '__jdlfile__', '%s', '%s', 'na62run%d.err', 'na62run%d.out', '%s', 'ganga', '%s', 'SUBMITTED'" % (
                app.run_number, app.decay_name, app.decay_type, app.radcor,
                app.mc_version, app.run_number, app.num_events,
                app.file_prefix, app.mc_version, app.run_number,
                app._impl.getMACFileName(), app.script_name, app.run_number,
                app.run_number, getJobByID(self.active_job_ids[0]).backend.id,
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"))  # %M = minutes; %m would give the month

            nec_file = ".gpytho"
            work_dir = "/clusterhome/home/protopop"
            nec_str = open(os.path.join(work_dir, nec_file)).read().strip().strip('#')
            mysqlc = "mysql -hhughnon.ppe.gla.ac.uk -ugridbot -p%s -s GridJobs" % nec_str

            rc, out = getstatusoutput(
                "echo \"INSERT INTO jobs (%s) VALUES (%s)\" | %s" %
                (ins_fields, ins_vals, mysqlc))

            if rc != 0:
                logger.error(out)
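
A note on the timestamp above: the original code used strftime("%Y-%m-%d %H:%m:%S"), where %m is the month directive; minutes are %M, which is what the "HH:mm:ss" comment intends. A minimal standalone check:

    from datetime import datetime

    stamp = datetime(2013, 7, 5, 14, 30, 59)
    print(stamp.strftime("%Y-%m-%d %H:%M:%S"))  # 2013-07-05 14:30:59 (minutes)
    print(stamp.strftime("%Y-%m-%d %H:%m:%S"))  # 2013-07-05 14:07:59 (month!)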
Example #2
    def getJobs(self):
        """ Get the job slice of all jobs that process this task """
        jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
        for trf in self.transforms:
            for jid in trf.getJobs():
                jobslice.objects[getJobByID(jid).fqid] = stripProxy(getJobByID(jid))

        return JobRegistrySliceProxy(jobslice)
Example #3
    def getJobs(self):
        """ Get the job slice of all jobs that process this task """
        jobslice = JobRegistrySlice("tasks(%i).getJobs()" % (self.id))
        for trf in self.transforms:
            for jid in trf.getJobs():
                jobslice.objects[getJobByID(jid).fqid] = stripProxy(getJobByID(jid))

        return JobRegistrySliceProxy(jobslice)
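
Both versions of getJobs() build the slice as a mapping keyed by fqid, so a job reached through several transforms is stored only once. A minimal standalone sketch of that deduplication pattern (plain dicts stand in for the Ganga registry objects):

    transforms = [{"jobs": ["0.1", "0.2"]}, {"jobs": ["0.2", "0.3"]}]
    jobslice = {}
    for trf in transforms:
        for fqid in trf["jobs"]:
            jobslice[fqid] = "job-" + fqid  # stand-in for stripProxy(getJobByID(jid))
    print(sorted(jobslice))  # ['0.1', '0.2', '0.3'] -- '0.2' appears once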
Example #4
    def n_all(self):
        total = 0
        for jid in self.active_job_ids:

            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("n_all Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
                if len(j._index_cache['subjobs:status']) != 0:
                    total += len(j._index_cache['subjobs:status'])
                else:
                    total += 1
            else:
                #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
                if j.subjobs:
                    total += len(j.subjobs)  # accumulate rather than overwrite earlier jobs
                else:
                    total += 1

        return total
Example #5
    def n_all(self):
        total = 0
        for jid in self.active_job_ids:

            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("n_all Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
                if len(j._index_cache['subjobs:status']) != 0:
                    total += len(j._index_cache['subjobs:status'])
                else:
                    total += 1
            else:
                #logger.warning("WARNING: (status check) No index cache for job object %d" % jid)
                if j.subjobs:
                    total += len(j.subjobs)  # accumulate rather than overwrite earlier jobs
                else:
                    total += 1

        return total
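
The branch on _index_cache above is a lazy-loading optimisation: if the registry cached the subjob statuses, the unit can be counted without materialising the job object at all. A minimal sketch of that shortcut, with a plain dict standing in for the cache:

    cache = {"subjobs:status": ["completed", "running", "failed"]}
    if cache.get("subjobs:status"):
        total = len(cache["subjobs:status"])  # count cached subjob statuses
    else:
        total = 1                             # a job with no subjobs counts once
    print(total)  # 3, without touching any job objects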
Example #6
    def updateStatus(self, status):
        """Update status hook"""

        # check for input data deletion of chain data
        if status == "completed" and self._getParent().delete_chain_input and len(self.req_units) > 0:

            # the inputdata field *must* be filled from the parent task
            # NOTE: When changing to inputfiles, will probably need to check
            # for any specified in trf.inputfiles

            # check that the parent replicas have been copied by checking
            # backend status == Done
            job_list = []
            for req_unit in self.req_units:
                trf = self._getParent()._getParent().transforms[int(req_unit.split(":")[0])]
                req_unit_id = req_unit.split(":")[1]

                if req_unit_id != "ALL":
                    unit = trf.units[int(req_unit_id)]
                    job_list.append(getJobByID(unit.active_job_ids[0]))
                else:
                    for unit in trf.units:
                        job_list.append(getJobByID(unit.active_job_ids[0]))

            for j in job_list:
                if j.subjobs:
                    for sj in j.subjobs:
                        if sj.backend.status != "Done":
                            return
                else:
                    if j.backend.status != "Done":
                        return

            job = getJobByID(self.active_job_ids[0])
            for f in job.inputdata.files:
                # check for an lfn
                if hasattr(f, "lfn"):
                    fname = f.lfn
                else:
                    fname = f.namePattern

                logger.warning("Removing chain inputdata file '%s'..." % fname)
                f.remove()

        super(LHCbUnit, self).updateStatus(status)
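
The gate before the deletion above requires every parent (sub)job to report a "Done" backend status; a single unfinished replica copy makes the method return without touching the input data. A standalone sketch of that all-or-nothing check, with dicts standing in for jobs:

    jobs = [{"subjobs": [{"status": "Done"}, {"status": "Done"}]},
            {"subjobs": [], "status": "Done"}]

    def all_done(jobs):
        for j in jobs:
            statuses = [sj["status"] for sj in j["subjobs"]] or [j["status"]]
            if any(s != "Done" for s in statuses):
                return False
        return True

    print(all_done(jobs))  # True -> safe to remove the chained inputdata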
Example #7
    def remove(self, remove_jobs="do_nothing"):
        """Delete the task"""

        # make sure the task isn't running
        if self.status.find("running") != -1:
            logger.error(
                "Task is still running. Please pause before removing!")
            return

        if remove_jobs not in [True, False]:
            logger.info("You want to remove the task %i named '%s'." %
                        (self.id, self.name))
            logger.info(
                "Since this operation cannot be easily undone, please call this command again:"
            )
            logger.info(
                " * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs,"
                % (self.id))
            logger.info(
                " * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs."
                % (self.id))
            return
        if remove_jobs:

            for trf in self.transforms:
                for unit in trf.units:
                    for jid in unit.active_job_ids:
                        try:
                            j = getJobByID(jid)
                            j.remove()
                        except Exception as err:
                            logger.debug("Remove Err: %s" % str(err))
                            pass

                    for jid in unit.prev_job_ids:
                        try:
                            j = getJobByID(jid)
                            j.remove()
                        except Exception as err2:
                            logger.debug("Remove Err2: %s" % str(err2))
                            pass

        self._getRegistry()._remove(self, auto_removed=1)
        logger.info("Task #%s deleted" % self.id)
Example #8
    def getParentUnitJobs(self, parent_units, include_subjobs=True):
        """Return the list of parent jobs"""
        job_list = []
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list += job.subjobs
            else:
                job_list += [job]

        return job_list
Example #9
 def removeUnusedJobs(self):
     """Remove all jobs that aren't being used, e.g. failed jobs"""
     for unit in self.units:
         for jid in unit.prev_job_ids:
             try:
                 logger.warning("Removing job '%d'..." % jid)
                 job = getJobByID(jid)
                 job.remove()
             except Exception as err:
                 logger.debug("removeUnused: %s" % str(err))
                 logger.error("Problem removing job '%d'" % jid)
Example #10
    def getParentUnitJobs(self, parent_units, include_subjobs=True):
        """Return the list of parent jobs"""
        job_list = []
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list += job.subjobs
            else:
                job_list += [job]

        return job_list
Example #11
 def removeUnusedJobs(self):
     """Remove all jobs that aren't being used, e.g. failed jobs"""
     for unit in self.units:
         for jid in unit.prev_job_ids:
             try:
                 logger.warning("Removing job '%d'..." % jid)
                 job = getJobByID(jid)
                 job.remove()
             except Exception as err:
                 logger.debug("removeUnused: %s" % str(err))
                 logger.error("Problem removing job '%d'" % jid)
Example #12
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            for inds in self.inputdata:
                from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
                if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriates over as input
        # files
        flist = []
        import re
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(pat, f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

        return unit
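
The include/exclude masks above are applied as regular expressions against each DiracFile LFN: an empty include list admits everything, and the exclude list is applied afterwards. A standalone sketch of the same filtering with made-up LFNs:

    import re

    lfns = ["/lhcb/prod/a.dst", "/lhcb/prod/a.hist.root", "/lhcb/prod/b.dst"]
    incl, excl = [r"\.dst$"], [r"/b\."]
    flist = ["LFN:" + l for l in lfns if not incl or any(re.search(p, l) for p in incl)]
    flist = [f for f in flist if not any(re.search(p, f) for p in excl)]
    print(flist)  # ['LFN:/lhcb/prod/a.dst']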
Example #13
    def remove(self, remove_jobs="do_nothing"):
        """Delete the task"""

        # make sure the task isn't running
        if self.status.find("running") != -1:
            logger.error(
                "Task is still running. Please pause before removing!")
            return

        if remove_jobs not in [True, False]:
            logger.info("You want to remove the task %i named '%s'." %
                        (self.id, self.name))
            logger.info(
                "Since this operation cannot be easily undone, please call this command again:")
            logger.info(
                " * as tasks(%i).remove(remove_jobs=True) if you want to remove all associated jobs," % (self.id))
            logger.info(
                " * as tasks(%i).remove(remove_jobs=False) if you want to keep the jobs." % (self.id))
            return
        if remove_jobs:

            for trf in self.transforms:
                for unit in trf.units:
                    for jid in unit.active_job_ids:
                        try:
                            j = getJobByID(jid)
                            j.remove()
                        except Exception as err:
                            logger.debug("Remove Err: %s" % str(err))
                            pass

                    for jid in unit.prev_job_ids:
                        try:
                            j = getJobByID(jid)
                            j.remove()
                        except Exception as err2:
                            logger.debug("Remove Err2: %s" % str(err2))
                            pass

        self._getRegistry()._remove(self, auto_removed=1)
        logger.info("Task #%s deleted" % self.id)
Example #14
    def checkForResubmission(self):
        """check if this unit should be resubmitted"""

        # check if we already have a job
        if len(self.active_job_ids) == 0:
            return False
        else:
            job = getJobByID(self.active_job_ids[0])
            if job.status in ["failed", "killed"]:
                return True

            return False
Example #15
    def checkForResubmission(self):
        """check if this unit should be resubmitted"""

        # check if we already have a job
        if len(self.active_job_ids) == 0:
            return False
        else:
            job = getJobByID(self.active_job_ids[0])
            if job.status in ["failed", "killed"]:
                return True

            return False
Example #16
   def createChainUnit( self, parent_units, use_copy_output = True ):
      """Create a chained unit using the output data from the given units"""

      # check all parent units for copy_output
      copy_output_ok = True
      for parent in parent_units:
         if not parent.copy_output:
            copy_output_ok = False

      # all parent units must be completed so the outputfiles are filled correctly
      for parent in parent_units:
         if parent.status != "completed":
            return None

      if len(parent_units) == 0:
         return None

      if not use_copy_output or not copy_output_ok:
         unit = ND280Unit()
         unit.inputdata = ND280LocalDataset()
         for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = getJobByID(parent.active_job_ids[0])
            for f in job.outputfiles:
               # should check for different file types and add them as appropriate to the dataset
               # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
               # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
               # TODO: implement use of include/exclude_file_mask
               try:
                 outputfilenameformat = f.outputfilenameformat
               except AttributeError:  # file type carries no outputfilenameformat
                 inputdir = job.outputdir
               else:
                 #### WARNING: The following will work only if the MassStorageFile puts the files in local directories !
                 inputdir = '/'.join( [getConfig('Output')['MassStorageFile']['uploadOptions']['path'], f.outputfilenameformat.replace('{fname}','')])
               unit.inputdata.get_dataset( inputdir, f.namePattern )
      else:

         unit = ND280Unit()
         unit.inputdata = ND280LocalDataset()

         for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
               return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
               unit.inputdata.names.append( os.path.join( parent.copy_output.local_location, f ) )
         
      return unit
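
The try/except/else around outputfilenameformat is an attribute probe: file types that carry their own output path (MassStorageFile here) are read from mass storage, everything else falls back to the job's output directory. A minimal sketch with hypothetical stand-in classes:

    class LocalFile:       # hypothetical: no outputfilenameformat attribute
        pass

    class StoredFile:      # hypothetical stand-in for MassStorageFile
        outputfilenameformat = "prod/{fname}"

    for f in (LocalFile(), StoredFile()):
        try:
            fmt = f.outputfilenameformat
        except AttributeError:
            print("fall back to job.outputdir")
        else:
            print("use mass-storage path:", fmt.replace("{fname}", ""))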
Example #17
    def createChainUnit(self, parent_units, use_copy_output=True):
        """Create an output unit given this output data"""

        # we need a parent job that has completed to get the output files
        incl_pat_list = []
        excl_pat_list = []
        for parent in parent_units:
            if len(parent.active_job_ids) == 0 or parent.status != "completed":
                return None

            for inds in self.inputdata:
                from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
                if isType(inds, TaskChainInput) and inds.input_trf_id == parent._getParent().getID():
                    incl_pat_list += inds.include_file_mask
                    excl_pat_list += inds.exclude_file_mask

        # go over the output files and copy the appropriates over as input
        # files
        flist = []
        import re
        for parent in parent_units:
            job = getJobByID(parent.active_job_ids[0])
            if job.subjobs:
                job_list = job.subjobs
            else:
                job_list = [job]

            for sj in job_list:
                for f in sj.outputfiles:

                    # match any dirac files that are allowed in the file mask
                    if isType(f, DiracFile):
                        if len(incl_pat_list) > 0:
                            for pat in incl_pat_list:
                                if re.search(pat, f.lfn):
                                    flist.append("LFN:" + f.lfn)
                        else:
                            flist.append("LFN:" + f.lfn)

                        if len(excl_pat_list) > 0:
                            for pat in excl_pat_list:
                                if re.search(pat, f.lfn) and "LFN:" + f.lfn in flist:
                                    flist.remove("LFN:" + f.lfn)

        # just do one unit that uses all data
        unit = LHCbUnit()
        unit.name = "Unit %d" % len(self.units)
        unit.inputdata = LHCbDataset(files=[DiracFile(lfn=f) for f in flist])

        return unit
Example #18
    def updateStatus(self, status):
        "if we've just been switched to running, the job has been submitted so update the DB"
        if (status == "running") and (self.status == "new") and len(self.active_job_ids) != 0:
            ins_fields = "run,description,decay_type,radcor,mc_version,seed,events,output_name,jdl,mac,exe,stderr,stdout,status_url,submitter,submitted_on,status"
            # (submitted_on format is "2013-MM-DD HH:mm:ss")
            app = getJobByID(self.active_job_ids[0]).application
            ins_vals = (
                "%d, '%s production job', %d, %d, %d, %d, %d, '%s_v%d_r%d.root', '__jdlfile__', '%s', '%s', 'na62run%d.err', 'na62run%d.out', '%s', 'ganga', '%s', 'SUBMITTED'"
                % (
                    app.run_number,
                    app.decay_name,
                    app.decay_type,
                    app.radcor,
                    app.mc_version,
                    app.run_number,
                    app.num_events,
                    app.file_prefix,
                    app.mc_version,
                    app.run_number,
                    app._impl.getMACFileName(),
                    app.script_name,
                    app.run_number,
                    app.run_number,
                    getJobByID(self.active_job_ids[0]).backend.id,
                    datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # %M = minutes; %m would give the month
                )
            )

            nec_file = ".gpytho"
            work_dir = "/clusterhome/home/protopop"
            nec_str = open(os.path.join(work_dir, nec_file)).read().strip().strip("#")
            mysqlc = "mysql -hhughnon.ppe.gla.ac.uk -ugridbot -p%s -s GridJobs" % nec_str

            rc, out = getstatusoutput('echo "INSERT INTO jobs (%s) VALUES (%s)" | %s' % (ins_fields, ins_vals, mysqlc))

            if rc != 0:
                logger.error(out)
Example #19
    def n_active(self):

        if self.status == 'completed':
            return 0

        tot_active = 0
        active_states = ['submitted', 'running']

        for jid in self.active_job_ids:

            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("n_active Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
                if len(j._index_cache['subjobs:status']) > 0:
                    for sj_stat in j._index_cache['subjobs:status']:
                        if sj_stat in active_states:
                            tot_active += 1
                else:
                    if j._index_cache['status'] in active_states:
                        tot_active += 1
            else:
                #logger.warning("WARNING: (active check) No index cache for job object %d" % jid)
                if j.status in active_states:
                    if j.subjobs:
                        for sj in j.subjobs:
                            if sj.status in active_states:
                                tot_active += 1
                    else:
                        tot_active += 1

        return tot_active
Example #20
    def n_active(self):

        if self.status == 'completed':
            return 0

        tot_active = 0
        active_states = ['submitted', 'running']

        for jid in self.active_job_ids:

            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("n_active Err: %s" % str(err))
                task = self._getParent()._getParent()
                trf = self._getParent()
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            j = stripProxy(job)

            # try to preserve lazy loading
            if hasattr(j, '_index_cache') and j._index_cache and 'subjobs:status' in j._index_cache:
                if len(j._index_cache['subjobs:status']) > 0:
                    for sj_stat in j._index_cache['subjobs:status']:
                        if sj_stat in active_states:
                            tot_active += 1
                else:
                    if j._index_cache['status'] in active_states:
                        tot_active += 1
            else:
                #logger.warning("WARNING: (active check) No index cache for job object %d" % jid)
                if j.status in active_states:
                    if j.subjobs:
                        for sj in j.subjobs:
                            if sj.status in active_states:
                                tot_active += 1
                    else:
                        tot_active += 1

        return tot_active
Example #21
    def removeUnusedData(self):
        """Remove any output data from orphaned jobs"""
        for unit in self.units:
            for jid in unit.prev_job_ids:
                try:
                    logger.warning("Removing data from job '%d'..." % jid)
                    job = getJobByID(jid)

                    jlist = []
                    if len(job.subjobs) > 0:
                        jlist = job.subjobs
                    else:
                        jlist = [job]

                    for sj in jlist:
                        for f in sj.outputfiles:
                            if isType(f, DiracFile) and f.lfn:  # isType returns a bool, not a type name
                                f.remove()
                except:
                    logger.error("Problem deleting data for job '%d'" % jid)
                    pass
Example #22
    def removeUnusedData(self):
        """Remove any output data from orphaned jobs"""
        for unit in self.units:
            for jid in unit.prev_job_ids:
                try:
                    logger.warning("Removing data from job '%d'..." % jid)
                    job = getJobByID(jid)

                    jlist = []
                    if len(job.subjobs) > 0:
                        jlist = job.subjobs
                    else:
                        jlist = [job]

                    for sj in jlist:
                        for f in sj.outputfiles:
                            if isType(f, DiracFile) and f.lfn:  # isType returns a bool, not a type name
                                f.remove()
                except:
                    logger.error("Problem deleting data for job '%d'" % jid)
                    pass
Example #23
   def createChainUnit( self, parent_units, use_copy_output = True ):
      """Create a chained unit using the output data from the given units"""

      # check all parent units for copy_output
      copy_output_ok = True
      for parent in parent_units:
         if not parent.copy_output:
            copy_output_ok = False

      # all parent units must be completed so the outputfiles are filled correctly
      for parent in parent_units:
         if parent.status != "completed":
            return None

      if not use_copy_output or not copy_output_ok:
         unit = ND280Unit_CSVEvtList()
         unit.inputdata = ND280LocalDataset()
         for parent in parent_units:
            # loop over the output files and add them to the ND280LocalDataset - THIS MIGHT NEED SOME WORK!
            job = getJobByID(parent.active_job_ids[0])
            for f in job.outputfiles:
               # should check for different file types and add them as appropriate to the dataset
               # self.inputdata (== TaskChainInput).include/exclude_file_mask could help with this
               # This will be A LOT easier with Ganga 6.1 as you can easily map outputfiles -> inputfiles!
               unit.inputdata.names.append( os.path.join( job.outputdir, f.namePattern ) )
      else:

         unit = ND280Unit_CSVEvtList()
         unit.inputdata = ND280LocalDataset()

         for parent in parent_units:
            # unit needs to have completed and downloaded before we can get file list
            if parent.status != "completed":
               return None

            # we should be OK so copy all output to the dataset
            for f in parent.copy_output.files:
               unit.inputdata.names.append( os.path.join( parent.copy_output.local_location, f ) )
         
      return unit
Example #24
    def updateQuery(self, resubmit=False):
        """Update the dataset information of the transforms. This will
        include any new data in the processing or re-run jobs that have data which
        has been removed."""
        if len(self.queries) == 0:
            raise GangaException(
                None, 'Cannot call updateQuery() on an LHCbTransform without any queries')

        if self._getParent() is not None:
            logger.info('Retrieving latest bookkeeping information for transform %i:%i, please wait...' % (
                self._getParent().id, self.getID()))
        else:
            logger.info(
                'Retrieving latest bookkeeping information for transform, please wait...')

        # check we have an input DS per BK Query
        while len(self.queries) > len(self.inputdata):
            self.inputdata.append(LHCbDataset())

        # loop over the queries and fill the file lists
        for id, query in enumerate(self.queries):

            # Get the latest dataset
            latest_dataset = query.getDataset()

            # Compare to previous inputdata, get new and removed
            logger.info(
                'Checking for new and removed data for query %d, please wait...' % self.queries.index(query))
            dead_data = LHCbDataset()
            new_data = LHCbDataset()

            # loop over the old data and compare
            new_data.files += latest_dataset.difference(
                self.inputdata[id]).files
            dead_data.files += self.inputdata[
                id].difference(latest_dataset).files

            # for dead data, find then kill/remove any associated jobs
            # loop over units and check any associated with this DS
            # TODO: Follow through chained tasks
            for unit in self.units:
                # associated unit
                if unit.input_datset_index != id:
                    continue

                # find the job
                if len(unit.active_job_ids) == 0:
                    continue

                # check the data
                for f in dead_data.files:
                    if f in unit.inputdata.files:

                        # kill the job
                        job = getJobByID(unit.active_job_ids[0])
                        if job.status in ['submitted', 'running']:
                            job.kill()

                        # forget the job
                        unit.prev_job_ids.append(unit.active_job_ids[0])
                        unit.active_job_ids = []
                        break

            # in any case, now just set the DS files to the new set
            self.inputdata[id].files = []
            self.inputdata[id].files = latest_dataset.files
Example #25
    def updateQuery(self, resubmit=False):
        """Update the dataset information of the transforms. This will
        include any new data in the processing or re-run jobs that have data which
        has been removed."""
        if len(self.queries) == 0:
            raise GangaException(
                None,
                'Cannot call updateQuery() on an LHCbTransform without any queries'
            )

        if self._getParent() is not None:
            logger.info(
                'Retrieving latest bookkeeping information for transform %i:%i, please wait...'
                % (self._getParent().id, self.getID()))
        else:
            logger.info(
                'Retrieving latest bookkeeping information for transform, please wait...'
            )

        # check we have an input DS per BK Query
        while len(self.queries) > len(self.inputdata):
            self.inputdata.append(LHCbDataset())

        # loop over the queries and fill the file lists
        for id, query in enumerate(self.queries):

            # Get the latest dataset
            latest_dataset = query.getDataset()

            # Compare to previous inputdata, get new and removed
            logger.info(
                'Checking for new and removed data for query %d, please wait...'
                % self.queries.index(query))
            dead_data = LHCbDataset()
            new_data = LHCbDataset()

            # loop over the old data and compare
            new_data.files += latest_dataset.difference(
                self.inputdata[id]).files
            dead_data.files += self.inputdata[id].difference(
                latest_dataset).files

            # for dead data, find then kill/remove any associated jobs
            # loop over units and check any associated with this DS
            # TODO: Follow through chained tasks
            for unit in self.units:
                # associated unit
                if unit.input_datset_index != id:
                    continue

                # find the job
                if len(unit.active_job_ids) == 0:
                    continue

                # check the data
                for f in dead_data.files:
                    if f in unit.inputdata.files:

                        # kill the job
                        job = getJobByID(unit.active_job_ids[0])
                        if job.status in ['submitted', 'running']:
                            job.kill()

                        # forget the job
                        unit.prev_job_ids.append(unit.active_job_ids[0])
                        unit.active_job_ids = []
                        break

            # in any case, now just set the DS files to the new set
            self.inputdata[id].files = []
            self.inputdata[id].files = latest_dataset.files
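
The new/dead bookkeeping comparison above is a symmetric set difference on file lists: files only in the latest query are new data, while files only in the stored inputdata are dead and their associated jobs get killed. Modelling each dataset as a set of names:

    latest = {"f1", "f2", "f4"}
    current = {"f1", "f2", "f3"}
    new_data = latest - current   # picked up on the next submission: {'f4'}
    dead_data = current - latest  # associated jobs are killed/forgotten: {'f3'}
    print(sorted(new_data), sorted(dead_data))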
Example #26
    def update(self):
        """Update the unit and (re)submit jobs as required"""

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString(self, "Creating Job...")
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString(self,
                                  "Attempting job submission with queues...")
                    from Ganga.Core.GangaThread.WorkerThreads import getQueues
                    getQueues().add(j.submit)
                else:
                    addInfoString(self, "Attempting job submission...")
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString(self, "Failed Job Submission")
                addInfoString(self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error(
                        "Too many resubmits (%i). Deactivating unit." %
                        (self.minor_resub_count + self.major_resub_count))
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit."
                            % self.minor_resub_count)
                        addInfoString(
                            self,
                            "Deactivating unit. Too many resubmits (%i)" %
                            (self.minor_resub_count + self.major_resub_count))
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." %
                        self.major_resub_count)
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString(self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString(self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
Example #27
    def update(self):
        """Update the unit and (re)submit jobs as required"""

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString( self, "Creating Job..." )
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString( self, "Attempting job submission with queues..." )
                    from Ganga.Core.GangaThread.WorkerThreads import getQueues
                    getQueues().add(j.submit)
                else:
                    addInfoString( self, "Attempting job submission..." )
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString( self, "Failed Job Submission")
                addInfoString( self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error("Too many resubmits (%i). Deactivating unit." % (
                        self.minor_resub_count + self.major_resub_count))
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % ( self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count)
                        addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count))
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count)
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString( self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString( self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
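
The failed/killed branch in update() keeps two counters: minor resubmits are tried until minor_run_limit, a major resubmit then resets the minor count, and minor + major is capped by run_limit overall. A simplified standalone sketch of that accounting (it ignores rebrokering and the separate major limit):

    minor_run_limit, run_limit = 3, 5
    minor, major = 0, 0
    for failure in range(12):               # pretend the job keeps failing
        if minor + major > run_limit - 1:
            print("deactivating unit: too many resubmits")
            break
        if minor > minor_run_limit - 1:
            major, minor = major + 1, 0     # escalate to a major resubmit
        else:
            minor += 1                      # plain minor resubmit
    print(minor, major)                     # 3 2 when the cap is hit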