Beispiel #1
0
 def minorResubmit(self, job):
     """perform just a minor resubmit"""
     try:
         trf = self._getParent()
     except Exception as err:
         logger.debug("GetParent exception!\n%s" % str(err))
         trf = None
     if trf is not None and trf.submit_with_threads:
         addInfoString( self, "Attempting job re-submission with queues..." )
         GPI.queues.add(job.resubmit)
     else:
         addInfoString( self, "Attempting job re-submission..." )
         job.resubmit()
Beispiel #2
0
 def minorResubmit(self, job):
     """perform just a minor resubmit"""
     try:
         trf = self._getParent()
     except Exception as err:
         logger.debug("GetParent exception!\n%s" % str(err))
         trf = None
     if trf is not None and trf.submit_with_threads:
         addInfoString(self, "Attempting job re-submission with queues...")
         GPI.queues.add(job.resubmit)
     else:
         addInfoString(self, "Attempting job re-submission...")
         job.resubmit()
Beispiel #3
0
    def addUnitToTRF(self, unit, prev_unit=None):
        """Add a unit to this Transform given the input and output data"""
        if not unit:
            raise ApplicationConfigurationError(None, "addUnitTOTRF failed for Transform %d (%s): No unit specified" % (self.getID(), self.name))

        addInfoString( self, "Adding Unit to TRF...")
        unit.updateStatus("hold")
        unit.active = True
        if prev_unit:
            unit.prev_job_ids += prev_unit.prev_job_ids
            self.units[prev_unit.getID()] = unit
        else:
            self.units.append(unit)
Beispiel #4
0
    def addUnitToTRF(self, unit, prev_unit=None):
        """Add a unit to this Transform given the input and output data"""
        if not unit:
            raise ApplicationConfigurationError(None, "addUnitTOTRF failed for Transform %d (%s): No unit specified" % (self.getID(), self.name))

        addInfoString( self, "Adding Unit to TRF...")
        unit.updateStatus("hold")
        unit.active = True
        if prev_unit:
            unit.prev_job_ids += prev_unit.prev_job_ids
            self.units[prev_unit.getID()] = unit
        else:
            self.units.append(unit)
            stripProxy(unit).id = len(self.units) - 1
Beispiel #5
0
    def reset(self):
        """Reset the unit completely"""
        addInfoString( self, "Reseting Unit...")
        self.minor_resub_count = 0
        self.major_resub_count = 0
        if len(self.active_job_ids) > 0:
            self.prev_job_ids += self.active_job_ids
        self.active_job_ids = []

        self.active = True

        # if has parents, set to recreate
        if len(self.req_units) > 0:
            self.updateStatus("recreating")
        else:
            self.updateStatus("running")
Beispiel #6
0
    def reset(self):
        """Reset the unit completely"""
        addInfoString(self, "Reseting Unit...")
        self.minor_resub_count = 0
        self.major_resub_count = 0
        if len(self.active_job_ids) > 0:
            self.prev_job_ids += self.active_job_ids
        self.active_job_ids = []

        self.active = True

        # if has parents, set to recreate
        if len(self.req_units) > 0:
            self.updateStatus("recreating")
        else:
            self.updateStatus("running")
Beispiel #7
0
    def resetUnit(self, uid):
        """Reset the given unit"""
        addInfoString( self, "Reseting Unit %i" % ( uid ) )

        for u in self.units:
            if u.getID() == uid:
                u.reset()
                break

        # find any chained units and mark for recreation
        for trf in self._getParent().transforms:
            for u2 in trf.units:
                for req in u2.req_units:
                    if req == "%d:%d" % (self.getID(), u.getID()) or req == "%d:ALL" % (self.getID()):
                        trf.resetUnit(u2.getID())

        self.updateStatus("running")
Beispiel #8
0
    def resetUnit(self, uid):
        """Reset the given unit"""
        addInfoString( self, "Reseting Unit %i" % ( uid ) )

        for u in self.units:
            if u.getID() == uid:
                u.reset()
                break

        # find any chained units and mark for recreation
        for trf in self._getParent().transforms:
            for u2 in trf.units:
                for req in u2.req_units:
                    if req == "%d:%d" % (self.getID(), u.getID()) or req == "%d:ALL" % (self.getID()):
                        trf.resetUnit(u2.getID())

        self.updateStatus("running")
Beispiel #9
0
    def update(self):
        """Called by the parent task to check for status updates, submit jobs, etc."""
        if self.status == "pause" or self.status == "new":
            return 0

        # check for complete required units
        task = self._getParent()
        for trf_id in self.required_trfs:
            if task.transforms[trf_id].status != "completed":
                return 0

        # set the start time if not already set
        if len(self.required_trfs) > 0 and self.units[0].start_time == 0:
            for unit in self.units:
                unit.start_time = time.time() + self.chain_delay * 60 - 1

        # report the info for this transform
        unit_status = { "new":0, "hold":0, "running":0, "completed":0, "bad":0, "recreating":0 }
        for unit in self.units:
            unit_status[unit.status] += 1
         
        info_str = "Unit overview: %i units, %i new, %i hold, %i running, %i completed, %i bad. to_sub %i" % (len(self.units), unit_status["new"], unit_status["hold"],
                                                                                                              unit_status["running"], unit_status["completed"],
                                                                                                              unit_status["bad"], self._getParent().n_tosub())
      
        addInfoString(self, info_str)
                
        # ask the unit splitter if we should create any more units given the
        # current data
        self.createUnits()

        # loop over units and update them ((re)submits will be called here)
        old_status = self.status
        unit_status_list = []

        # find submissions first
        unit_update_list = []
        for unit in self.units:

            if not unit.checkForSubmission() and not unit.checkForResubmission():
                unit_update_list.append(unit)
                continue

            if unit.update() and self.abort_loop_on_submit:
                logger.info("Unit %d of transform %d, Task %d has aborted the loop" % (
                    unit.getID(), self.getID(), task.id))
                return 1

            unit_status_list.append(unit.status)

        # now check for download
        for unit in unit_update_list:
            if unit.update() and self.abort_loop_on_submit:
                logger.info("Unit %d of transform %d, Task %d has aborted the loop" % (
                    unit.getID(), self.getID(), task.id))
                return 1

            unit_status_list.append(unit.status)

        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        # check for any TaskChainInput completions
        for ds in self.inputdata:
            if isType(ds, TaskChainInput) and ds.input_trf_id != -1:
                if task.transforms[ds.input_trf_id].status != "completed":
                    return 0

        # update status and check
        for state in ['running', 'hold', 'bad', 'completed']:
            if state in unit_status_list:
                if state == 'hold':
                    state = "running"
                if state != self.status:
                    self.updateStatus(state)
                break
Beispiel #10
0
    def update(self):
        """Called by the parent task to check for status updates, submit jobs, etc."""
        if self.status == "pause" or self.status == "new":
            return 0

        # check for complete required units
        task = self._getParent()
        for trf_id in self.required_trfs:
            if task.transforms[trf_id].status != "completed":
                return 0

        # set the start time if not already set
        if len(self.required_trfs) > 0 and self.units[0].start_time == 0:
            for unit in self.units:
                unit.start_time = time.time() + self.chain_delay * 60 - 1

        # report the info for this transform
        unit_status = { "new":0, "hold":0, "running":0, "completed":0, "bad":0, "recreating":0 }
        for unit in self.units:
            unit_status[unit.status] += 1
         
        info_str = "Unit overview: %i units, %i new, %i hold, %i running, %i completed, %i bad. to_sub %i" % (len(self.units), unit_status["new"], unit_status["hold"],
                                                                                                              unit_status["running"], unit_status["completed"],
                                                                                                              unit_status["bad"], self._getParent().n_tosub())
      
        addInfoString(self, info_str)
                
        # ask the unit splitter if we should create any more units given the
        # current data
        self.createUnits()

        # loop over units and update them ((re)submits will be called here)
        old_status = self.status
        unit_status_list = []

        # find submissions first
        unit_update_list = []
        for unit in self.units:

            if not unit.checkForSubmission() and not unit.checkForResubmission():
                unit_update_list.append(unit)
                continue

            if unit.update() and self.abort_loop_on_submit:
                logger.info("Unit %d of transform %d, Task %d has aborted the loop" % (
                    unit.getID(), self.getID(), task.id))
                return 1

            unit_status_list.append(unit.status)

        # now check for download
        for unit in unit_update_list:
            if unit.update() and self.abort_loop_on_submit:
                logger.info("Unit %d of transform %d, Task %d has aborted the loop" % (
                    unit.getID(), self.getID(), task.id))
                return 1

            unit_status_list.append(unit.status)

        from Ganga.GPIDev.Lib.Tasks.TaskChainInput import TaskChainInput
        # check for any TaskChainInput completions
        for ds in self.inputdata:
            if isType(ds, TaskChainInput) and ds.input_trf_id != -1:
                if task.transforms[ds.input_trf_id].status != "completed":
                    return 0

        # update status and check
        for state in ['running', 'hold', 'bad', 'completed']:
            if state in unit_status_list:
                if state == 'hold':
                    state = "running"
                if state != self.status:
                    self.updateStatus(state)
                break
Beispiel #11
0
 def updateStatus(self, status):
     """Update status hook"""
     addInfoString(self, "Status change from '%s' to '%s'" % (self.status, status))
     self.status = status
Beispiel #12
0
    def update(self):
        """Update the unit and (re)submit jobs as required"""

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString( self, "Creating Job..." )
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString( self, "Attempting job submission with queues..." )
                    from Ganga.Core.GangaThread.WorkerThreads import getQueues
                    getQueues().add(j.submit)
                else:
                    addInfoString( self, "Attempting job submission..." )
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString( self, "Failed Job Submission")
                addInfoString( self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = getJobByID(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning("Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)" %
                               (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error("Too many resubmits (%i). Deactivating unit." % (
                        self.minor_resub_count + self.major_resub_count))
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % ( self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit." % self.minor_resub_count)
                        addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.minor_resub_count))
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." % self.major_resub_count)
                    addInfoString( self, "Deactivating unit. Too many resubmits (%i)" % (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString( self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString( self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error("Couldn't resubmit the job. Deactivating unit.")
                        addInfoString( self, "Failed Job resubmission")
                        addInfoString( self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
Beispiel #13
0
    def update(self):
        """Update the unit and (re)submit jobs as required"""
        #logger.warning("Entered Unit %d update function..." % self.getID())

        # if we're complete, then just return
        if self.status in ["completed", "recreating"] or not self.active:
            return 0

        # check if submission is needed
        task = self._getParent()._getParent()
        trf = self._getParent()
        maxsub = task.n_tosub()

        # check parent unit(s)
        req_ok = self.checkParentUnitsAreComplete()

        # set the start time if not already set
        if len(self.req_units) > 0 and req_ok and self.start_time == 0:
            self.start_time = time.time() + trf.chain_delay * 60 - 1

        if req_ok and self.checkForSubmission() and maxsub > 0:

            # create job and submit
            addInfoString(self, "Creating Job...")
            j = self.createNewJob()
            if j.name == '':
                j.name = "T%i:%i U%i" % (task.id, trf.getID(), self.getID())

            try:
                if trf.submit_with_threads:
                    addInfoString(self,
                                  "Attempting job submission with queues...")
                    GPI.queues.add(j.submit)
                else:
                    addInfoString(self, "Attempting job submission...")
                    j.submit()

            except Exception as err:
                logger.debug("update Err: %s" % str(err))
                addInfoString(self, "Failed Job Submission")
                addInfoString(self, "Reason: %s" % (formatTraceback()))
                logger.error("Couldn't submit the job. Deactivating unit.")
                self.prev_job_ids.append(j.id)
                self.active = False
                trf._setDirty()  # ensure everything's saved
                return 1

            self.active_job_ids.append(j.id)
            self.updateStatus("running")
            trf._setDirty()  # ensure everything's saved

            if trf.submit_with_threads:
                return 0

            return 1

        # update any active jobs
        for jid in self.active_job_ids:

            # we have an active job so see if this job is OK and resubmit if
            # not
            try:
                job = GPI.jobs(jid)
            except Exception as err:
                logger.debug("Update2 Err: %s" % str(err))
                logger.warning(
                    "Cannot find job with id %d. Maybe reset this unit with: tasks(%d).transforms[%d].resetUnit(%d)"
                    % (jid, task.id, trf.getID(), self.getID()))
                continue

            if job.status == "completed":

                # check if actually completed
                if not self.checkCompleted(job):
                    return 0

                # check for DS copy
                if trf.unit_copy_output:
                    if not self.copy_output:
                        trf.createUnitCopyOutputDS(self.getID())

                    if not self.copyOutput():
                        return 0

                # check for merger
                if trf.unit_merger:
                    if not self.merger:
                        self.merger = trf.createUnitMerger(self.getID())

                    if not self.merge():
                        return 0

                # all good so mark unit as completed
                self.updateStatus("completed")

            elif job.status == "failed" or job.status == "killed":

                # check for too many resubs
                if self.minor_resub_count + self.major_resub_count > trf.run_limit - 1:
                    logger.error(
                        "Too many resubmits (%i). Deactivating unit." %
                        (self.minor_resub_count + self.major_resub_count))
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                rebroker = False

                if self.minor_resub_count > trf.minor_run_limit - 1:
                    if self._getParent().rebroker_on_job_fail:
                        rebroker = True
                    else:
                        logger.error(
                            "Too many minor resubmits (%i). Deactivating unit."
                            % self.minor_resub_count)
                        addInfoString(
                            self,
                            "Deactivating unit. Too many resubmits (%i)" %
                            (self.minor_resub_count + self.minor_resub_count))
                        self.active = False
                        return 0

                if self.major_resub_count > trf.major_run_limit - 1:
                    logger.error(
                        "Too many major resubmits (%i). Deactivating unit." %
                        self.major_resub_count)
                    addInfoString(
                        self, "Deactivating unit. Too many resubmits (%i)" %
                        (self.minor_resub_count + self.major_resub_count))
                    self.active = False
                    return 0

                # check the type of resubmit
                if rebroker or self.checkMajorResubmit(job):

                    self.major_resub_count += 1
                    self.minor_resub_count = 0

                    try:
                        addInfoString(self, "Attempting major resubmit...")
                        self.majorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err3: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False

                    # break the loop now because we've probably changed the
                    # active jobs list
                    return 1
                else:
                    self.minor_resub_count += 1
                    try:
                        addInfoString(self, "Attempting minor resubmit...")
                        self.minorResubmit(job)
                    except Exception as err:
                        logger.debug("Update Err4: %s" % str(err))
                        logger.error(
                            "Couldn't resubmit the job. Deactivating unit.")
                        addInfoString(self, "Failed Job resubmission")
                        addInfoString(self, "Reason: %s" % (formatTraceback()))
                        self.active = False
                        return 1
Beispiel #14
0
 def updateStatus(self, status):
     """Update status hook"""
     addInfoString(
         self, "Status change from '%s' to '%s'" % (self.status, status))
     self.status = status