Example #1
0
 def GetNumSubmit(self, idleslots, idlejobs, idleuserjobs):
     """
     Calculate the number of glideins to submit.
     
     @param idleslots: Number of idle startd's
     @param idlejobs: Number of glideins in queue, but not active
     @param idleuserjobs: Number of idle user jobs from FLOCK_FROM
     
     @return: int - Number of glideins to submit
     """
     
     # Enough glideins are already queued or idle to cover the requested
     # user jobs -- nothing more to submit.
     if idleuserjobs <= max(idlejobs, idleslots):
         logging.debug("The number of idlejobs or idleslots fufills the requested idleuserjobs, not submitting any glideins")
         return 0
     
     status = ClusterStatus(status_constraint="IsUndefined(Offline)")
     
     # Sanity check: running glidein jobs should show up as running
     # glideins in the collector (allow 10% slack).
     running_glidein_jobs = status.GetRunningGlideinJobs()
     logging.debug("Number of running_glidein_jobs = %i", running_glidein_jobs)
     running_glideins = status.GetRunningGlideins()
     logging.debug("Number of running glideins = %i", running_glideins)
     
     if running_glideins < (running_glidein_jobs * .9):
         logging.error("I'm guessing glideins are not reporting to the collector, not submitting")
         return 0
     
     # Submit up to the tightest of three limits: remaining queue depth,
     # actual user demand, and the idle-glidein ceiling.
     queue_headroom = int(get_option("maxqueuedjobs")) - idlejobs
     idle_headroom = int(get_option("MaxIdleGlideins")) - idleslots
     return min(queue_headroom, idleuserjobs, idle_headroom)
    def Stop(self):
        status = ClusterStatus()

        # Get the factory id
        factoryID = status.GetFactoryID()

        # Remove the factory job
        (stderr, stdout) = RunExternal("condor_rm %s" % factoryID)
        print "Stderr = %s" % stderr.strip()
Example #3
0
 def __init__(self, cluster_unique, useOffline = False):
     """
     Set up the per-cluster collector/queue views and parse the cluster id.
     
     @param cluster_unique: Cluster unique string, e.g. "user@host/pbs"
     @param useOffline: If True, also track offline ads for this cluster
     """
     self.cluster_unique = cluster_unique
     
     # Restrict both the startd (status) and job-queue views to ads that
     # belong to this cluster.
     self.status = ClusterStatus(status_constraint="IsUndefined(Offline) && BOSCOCluster =?= \"%s\"" % self.cluster_unique, queue_constraint = "BOSCOCluster =?= \"%s\"" % self.cluster_unique)
     self.useOffline = useOffline
     if useOffline:
         self.offline = OfflineAds()
     
     self.cluster_entry, self.cluster_type = self._ParseClusterId(cluster_unique)
     # Default to PBS when the id carries no scheduler type.  Use "is None"
     # (identity test for the None singleton) instead of "== None".
     if self.cluster_type is None:
         self.cluster_type = "pbs"
    def Restart(self):
        status = ClusterStatus()

        # Get the factory id
        factoryID = status.GetFactoryID()

        # Hold then release the factory in the queue
        (stderr, stdout) = RunExternal("condor_hold %s" % factoryID)
        print "Stderr = %s" % stderr.strip()
        #print "Stdout = %s" % stdout.strip()

        (stderr, stdout) = RunExternal("condor_release %s" % factoryID)
        print "Stderr = %s" % stderr.strip()
    def Start(self):
        """ 
        Start the Factory 
        
        Runs the main daemon loop forever: each iteration counts idle user
        jobs (skipped when offline-ad mode is enabled), then for every
        configured cluster checks its submission preferences and submits the
        calculated number of glideins.
        """
        # NOTE(review): "Intialize" is presumably the real (misspelled)
        # method name defined elsewhere on this class -- confirm before
        # renaming.
        self.Intialize()

        # NOTE(review): statuses, status and offline are not read anywhere
        # in this loop -- confirm they are leftovers.
        statuses = {}
        status = ClusterStatus(status_constraint="IsUndefined(Offline)")
        offline = OfflineAds()

        # First, daemonize?

        while 1:
            logging.info("Starting iteration...")

            # Check if there are any idle jobs
            # NOTE(review): attribute is spelled UseOffline here but the
            # Cluster class uses useOffline; these are different classes --
            # confirm this class really defines UseOffline.
            if not self.UseOffline:
                user_idle = self.GetIdleJobs(ClusterStatus())
                if user_idle == None:
                    logging.info("Received None from idle jobs")
                    self.SleepFactory()
                    continue

                # Total idle jobs summed over all users.
                idleuserjobs = 0
                for user in user_idle.keys():
                    idleuserjobs += user_idle[user]

                logging.debug("Idle jobs = %i" % idleuserjobs)
                if idleuserjobs < 1:
                    logging.info("No idle jobs")
                    self.SleepFactory()
                    continue

            # For each ssh'd blahp
            for cluster in self.cluster_list:
                idleslots = idlejobs = 0

                # In offline mode, per-cluster demand comes from that
                # cluster's offline ads rather than the queue.
                if self.UseOffline:
                    idleuserjobs = cluster.GetIdleJobs()

                # Check if the cluster is able to submit jobs
                try:
                    (idleslots, idlejobs) = cluster.ClusterMeetPreferences()
                except ClusterPreferenceException, e:
                    logging.debug("Received error from ClusterMeetPreferences")
                    logging.debug(e)
                    idleslots = idlejobs = None

                # If the cluster preferences weren't met, then move on
                if idleslots == None or idlejobs == None:
                    continue

                # Get the offline ads to update.
                # NOTE(review): num_submit is overwritten immediately below,
                # so this call only matters for GetIdleJobs' side effects
                # (updating the offline ads) -- confirm that is intentional.
                if self.UseOffline:
                    num_submit = cluster.GetIdleJobs()

                # Determine how many glideins to submit
                num_submit = self.GetNumSubmit(idleslots, idlejobs,
                                               idleuserjobs)
                logging.info("Submitting %i glidein jobs", num_submit)
                cluster.SubmitGlideins(num_submit)

            self.SleepFactory()
Example #6
0
class Cluster:
    """
    A single BOSCO submission cluster.

    Wraps the cluster-scoped condor collector/queue views, checks the
    per-cluster submission limits, and submits glidein jobs to the cluster.
    """

    def __init__(self, cluster_unique, useOffline=False):
        """
        @param cluster_unique: Cluster unique string, e.g. "user@host/pbs"
        @param useOffline: If True, also track offline ads for this cluster
        """
        self.cluster_unique = cluster_unique

        # Restrict both the startd (status) and job-queue views to ads that
        # belong to this cluster.
        self.status = ClusterStatus(
            status_constraint="IsUndefined(Offline) && BOSCOCluster =?= \"%s\""
            % self.cluster_unique,
            queue_constraint="BOSCOCluster =?= \"%s\"" % self.cluster_unique)
        self.useOffline = useOffline
        if useOffline:
            self.offline = OfflineAds()

        self.cluster_entry, self.cluster_type = self._ParseClusterId(
            cluster_unique)
        # Default to PBS when the id carries no scheduler type.
        if self.cluster_type is None:
            self.cluster_type = "pbs"

    def get_option(self, option, default=None):
        """
        Look up a configuration option scoped to this cluster's section.

        @param option: Option name
        @param default: Value to return when the option is unset
        """
        return campus_factory.util.CampusConfig.get_option(
            option, default, self.cluster_unique)

    def _ParseClusterId(self, cluster_unique):
        """
        Split a cluster id into its entry and scheduler type.

        @param cluster_unique: Cluster unique string usually sent with bosco_cluster -l
        @return: ( cluster_entry, cluster_type ) -- cluster_type is None
            when the id carries no "/type" suffix
        """
        # Line: [email protected]/pbs
        split_cluster = cluster_unique.split("/")
        if len(split_cluster) == 0:
            # Unreachable in practice: str.split always yields >= 1 element.
            return (None, None)
        if len(split_cluster) == 1:
            return (split_cluster[0], None)
        elif len(split_cluster) == 2:
            return (split_cluster[0], split_cluster[1])
        else:
            # More than one "/" -- warn, then fall back to the first two
            # components.
            logging.error("Unable to parse cluster id: %s" % cluster_unique)
            logging.error(
                "Going to just try using entry %s, with cluster type %s" %
                (split_cluster[0], split_cluster[1]))
            return (split_cluster[0], split_cluster[1])

    def ClusterMeetPreferences(self):
        """
        Check whether this cluster can accept more glideins.

        @return: (idleslots, idlejobs) counts from the cluster views
        @raise ClusterPreferenceException: when a count is unavailable or a
            configured limit (MAXIDLEGLIDEINS, maxqueuedjobs, maxheldjobs)
            has been reached
        """
        idleslots = self.status.GetIdleGlideins()
        if idleslots is None:
            logging.info(
                "Received None from idle glideins, going to try later")
            raise ClusterPreferenceException(
                "Received None from idle glideins")
        logging.debug("Idle glideins = %i" % idleslots)
        if idleslots >= int(self.get_option("MAXIDLEGLIDEINS", "5")):
            logging.info("Too many idle glideins")
            raise ClusterPreferenceException("Too many idle glideins")

        # Check for idle glidein jobs
        idlejobs = self.status.GetIdleGlideinJobs()
        if idlejobs is None:
            logging.info(
                "Received None from idle glidein jobs, going to try later")
            raise ClusterPreferenceException(
                "Received None from idle glidein jobs")
        logging.debug("Queued jobs = %i" % idlejobs)
        if idlejobs >= int(self.get_option("maxqueuedjobs", "5")):
            logging.info("Too many queued jobs")
            raise ClusterPreferenceException("Too many queued jobs")

        # Check for held jobs
        heldjobs = self.status.GetHeldGlideins()
        if heldjobs is None:
            logging.info(
                "Received None from held glidein jobs, going to try later")
            raise ClusterPreferenceException(
                "Received None from held glidein jobs")
        logging.debug("Held jobs = %i" % heldjobs)
        if heldjobs >= int(self.get_option("maxheldjobs", "5")):
            logging.info("Too many held jobs for cluster %s" %
                         self.cluster_unique)
            raise ClusterPreferenceException("Too many held jobs")

        return (idleslots, idlejobs)

    def GetIdleJobs(self):
        """
        Number of idle user jobs this cluster should serve, derived from its
        offline ads.

        @return: int - 0 when offline tracking is disabled or nothing
            matched; otherwise at least 5
        """
        if not self.useOffline:
            return 0

        # Update the offline cluster information
        toSubmit = self.offline.Update([self.cluster_unique])

        # Get the delinquent sites
        num_submit = self.offline.GetDelinquentSites([self.cluster_unique])
        logging.debug("toSubmit from offline %s", str(toSubmit))
        logging.debug("num_submit = %s\n", str(num_submit))

        if (len(toSubmit) > 0) or num_submit[self.cluster_unique]:
            idleuserjobs = max([num_submit[self.cluster_unique], 5])
            logging.debug(
                "Offline ads detected jobs should be submitted.  Idle user jobs set to %i",
                idleuserjobs)
        else:
            logging.debug(
                "Offline ads did not detect any matches or Delinquencies.")
            idleuserjobs = 0

        # Bug fix: this used to "return toSubmit" (a collection from
        # OfflineAds.Update), but callers use the result as an integer job
        # count -- which is exactly what idleuserjobs holds (and what the
        # debug message above reports).
        return idleuserjobs

    def SubmitGlideins(self, numSubmit):
        """
        Submit numSubmit glideins.
        
        @param numSubmit: The number of glideins to submit.
        """
        # Substitute values in submit file
        filename = os.path.join(self.get_option("GLIDEIN_DIRECTORY"),
                                "job.submit.template")

        # Submit jobs
        for i in range(numSubmit):
            self.SingleSubmit(filename)

        # Delete the submit file

    def SingleSubmit(self, filename):
        """
        Submit a single glidein job
        
        @param filename: The file (string) to submit
        
        """

        # Get the cluster specific information
        # First, the cluster tmp directory
        cluster_tmp = self.get_option("worker_tmp", "/tmp")
        remote_factory_location = self.get_option("remote_factory",
                                                  "~/bosco/campus_factory")

        # If we are submitting to ourselves, then don't need remote cluster
        if self.get_option("CONDOR_HOST") == self.cluster_unique:
            remote_cluster = ""
        else:
            remote_cluster = self.cluster_entry

        # Get any custom attributes that are defined in the configuration
        custom_options = {}
        custom_options_raw = self.get_option("custom_condor_submit")
        if custom_options_raw is not None:
            for option in custom_options_raw.split(";"):
                # Split on the first "=" only, so option values may
                # themselves contain "=".
                (lside, rside) = option.split("=", 1)
                custom_options[lside.strip()] = rside.strip()

        # TODO: These options should be moved to a better location
        predetermined_options = {
            "WN_TMP": cluster_tmp,
            "GLIDEIN_HOST": self.get_option("COLLECTOR_HOST"),
            "GLIDEIN_Site": self.cluster_unique,
            "BOSCOCluster": self.cluster_unique,
            "REMOTE_FACTORY": remote_factory_location,
            "REMOTE_CLUSTER": remote_cluster,
            "REMOTE_SCHEDULER": self.cluster_type,
            "GLIDEIN_DIR": self.get_option("GLIDEIN_DIRECTORY"),
            "PASSWDFILE_LOCATION": self.get_option("SEC_PASSWORD_FILE")}

        # Combine the custom options with the pre-determined options.  Prefer
        # custom options over pre-determined.  (Copy + update has the same
        # semantics as the old items()-concatenation but also works on
        # Python 3.)
        options = dict(predetermined_options)
        options.update(custom_options)

        options_str = ""
        for key in options.keys():
            options_str += " -a %s=\"%s\"" % (key, options[key])

        (stdout,
         stderr) = RunExternal("condor_submit %s %s" % (filename, options_str))
        logging.debug("stdout: %s" % stdout)
        logging.debug("stderr: %s" % stderr)