def GetNumSubmit(self, idleslots, idlejobs, idleuserjobs):
    """
    Calculate the number of glideins to submit.

    @param idleslots: Number of idle startd's
    @param idlejobs: Number of glideins in queue, but not active
    @param idleuserjobs: Number of idle user jobs from FLOCK_FROM

    @return: int - Number of glideins to submit.  Never negative: 0 is
        returned when nothing should be submitted.
    """
    # If we have already submitted enough glideins to fulfill the request,
    # don't submit more.
    if max(idlejobs, idleslots) >= idleuserjobs:
        logging.debug("The number of idlejobs or idleslots fufills the requested idleuserjobs, not submitting any glideins")
        return 0

    status = ClusterStatus(status_constraint="IsUndefined(Offline)")

    # Check that running glideins are reporting to the collector
    running_glidein_jobs = status.GetRunningGlideinJobs()
    running_glideins = status.GetRunningGlideins()

    # Other ClusterStatus getters are None-checked by their callers in this
    # file; guard here as well so a failed query cannot raise a TypeError in
    # the arithmetic below.
    if running_glidein_jobs is None or running_glideins is None:
        logging.error("Unable to determine the number of running glideins, not submitting")
        return 0

    logging.debug("Number of running_glidein_jobs = %i", running_glidein_jobs)
    logging.debug("Number of running glideins = %i", running_glideins)

    # More than 10% of the running glidein jobs are missing from the
    # collector: assume something is wrong and back off.
    if (running_glidein_jobs * .9) > running_glideins:
        logging.error("I'm guessing glideins are not reporting to the collector, not submitting")
        return 0

    # Ok, so now submit until we can't submit any more, or there are less
    # user jobs.
    queue_room = int(get_option("maxqueuedjobs")) - idlejobs
    slot_room = int(get_option("MaxIdleGlideins")) - idleslots

    # Either "room" value goes negative when the configured limit is below
    # the current count; clamp so callers never see a negative submit count.
    return max(0, min(queue_room, idleuserjobs, slot_room))
def Stop(self):
    """
    Stop the factory by removing its job from the queue with condor_rm.
    """
    # Look up the id of the factory job
    factory_id = ClusterStatus().GetFactoryID()

    # Remove the factory job and report the first stream RunExternal returns.
    # NOTE(review): another caller in this file unpacks RunExternal as
    # (stdout, stderr); confirm which order is right for the label below.
    (reported, _ignored) = RunExternal("condor_rm %s" % factory_id)
    print("Stderr = %s" % reported.strip())
def __init__(self, cluster_unique, useOffline=False):
    """
    Set up the state for one cluster.

    @param cluster_unique: Cluster unique string; it is embedded in the
        ClusterStatus constraints and parsed into an entry and a type.
    @param useOffline: When true, an OfflineAds instance is created and
        stored as self.offline.
    """
    self.cluster_unique = cluster_unique

    # Restrict both the status and the queue queries to this cluster
    self.status = ClusterStatus(
        status_constraint="IsUndefined(Offline) && BOSCOCluster =?= \"%s\"" % self.cluster_unique,
        queue_constraint="BOSCOCluster =?= \"%s\"" % self.cluster_unique)

    self.useOffline = useOffline
    if useOffline:
        self.offline = OfflineAds()

    self.cluster_entry, self.cluster_type = self._ParseClusterId(cluster_unique)

    # Fall back to "pbs" when the id carries no cluster type.  Identity
    # comparison is the correct test for the None singleton.
    if self.cluster_type is None:
        self.cluster_type = "pbs"
def Restart(self):
    """
    Restart the factory by holding and then releasing its job in the queue.
    """
    # Look up the id of the factory job
    factory_id = ClusterStatus().GetFactoryID()

    # Hold, then release; after each command print the first stream that
    # RunExternal returns.
    # NOTE(review): another caller in this file unpacks RunExternal as
    # (stdout, stderr); confirm which order is right for the label below.
    (hold_report, _ignored) = RunExternal("condor_hold %s" % factory_id)
    print("Stderr = %s" % hold_report.strip())

    (release_report, _ignored) = RunExternal("condor_release %s" % factory_id)
    print("Stderr = %s" % release_report.strip())
def Start(self):
    """
    Start the Factory

    Runs forever.  Each iteration:
      - when not using offline ads, totals the idle user jobs and sleeps
        until the next iteration if there are none (or the query failed);
      - for every cluster in self.cluster_list, checks the cluster's
        preferences, asks GetNumSubmit how many glideins to submit and
        submits them;
      - sleeps via SleepFactory.
    """
    self.Intialize()

    # NOTE(review): neither object is read again in this method.  They are
    # kept in case the constructors have side effects -- confirm and remove.
    status = ClusterStatus(status_constraint="IsUndefined(Offline)")
    offline = OfflineAds()

    # First, daemonize?

    while True:
        logging.info("Starting iteration...")

        # Check if there are any idle jobs
        if not self.UseOffline:
            user_idle = self.GetIdleJobs(ClusterStatus())
            if user_idle is None:
                logging.info("Received None from idle jobs")
                self.SleepFactory()
                continue

            # user_idle maps each user to a count of idle jobs
            idleuserjobs = sum(user_idle.values())

            logging.debug("Idle jobs = %i" % idleuserjobs)
            if idleuserjobs < 1:
                logging.info("No idle jobs")
                self.SleepFactory()
                continue

        # For each ssh'd blahp
        for cluster in self.cluster_list:
            idleslots = idlejobs = 0

            # With offline ads the demand is determined per cluster.  This
            # is the only GetIdleJobs call per cluster: a second call whose
            # result was immediately overwritten used to follow the
            # preference check and has been dropped.
            if self.UseOffline:
                idleuserjobs = cluster.GetIdleJobs()

            # Check if the cluster is able to submit jobs
            try:
                (idleslots, idlejobs) = cluster.ClusterMeetPreferences()
            except ClusterPreferenceException as e:
                logging.debug("Received error from ClusterMeetPreferences")
                logging.debug(e)
                # The cluster preferences weren't met, move on
                continue

            # Defensive: also skip the cluster if no counts came back
            if idleslots is None or idlejobs is None:
                continue

            # Determine how many glideins to submit
            num_submit = self.GetNumSubmit(idleslots, idlejobs, idleuserjobs)
            logging.info("Submitting %i glidein jobs", num_submit)
            cluster.SubmitGlideins(num_submit)

        self.SleepFactory()
class Cluster:
    """
    A single cluster that glideins can be submitted to.

    Holds the cluster's unique id, a ClusterStatus restricted to that
    cluster and, optionally, an OfflineAds instance.
    """

    def __init__(self, cluster_unique, useOffline=False):
        """
        @param cluster_unique: Cluster unique string; it is embedded in the
            ClusterStatus constraints and parsed into an entry and a type.
        @param useOffline: When true, an OfflineAds instance is created and
            stored as self.offline.
        """
        self.cluster_unique = cluster_unique

        # Restrict both the status and the queue queries to this cluster
        self.status = ClusterStatus(
            status_constraint="IsUndefined(Offline) && BOSCOCluster =?= \"%s\"" % self.cluster_unique,
            queue_constraint="BOSCOCluster =?= \"%s\"" % self.cluster_unique)

        self.useOffline = useOffline
        if useOffline:
            self.offline = OfflineAds()

        self.cluster_entry, self.cluster_type = self._ParseClusterId(
            cluster_unique)

        # Fall back to "pbs" when the id carries no cluster type
        if self.cluster_type is None:
            self.cluster_type = "pbs"

    def get_option(self, option, default=None):
        """
        Look up a configuration option, passing this cluster's unique id
        through to CampusConfig.get_option.

        @param option: Name of the option
        @param default: Value handed to CampusConfig.get_option as default
        @return: Whatever CampusConfig.get_option returns
        """
        return campus_factory.util.CampusConfig.get_option(
            option, default, self.cluster_unique)

    def _ParseClusterId(self, cluster_unique):
        """
        @param cluster_unique: Cluster unique string usually sent with bosco_cluster -l
        @return: ( cluster_entry, cluster_type ); cluster_type is None when
            the string contains no "/"
        """
        # Line: user@host.example.com/pbs
        # str.split always yields at least one element, so only the
        # "no type", "entry/type" and "too many parts" cases exist.
        split_cluster = cluster_unique.split("/")

        if len(split_cluster) == 1:
            return (split_cluster[0], None)

        if len(split_cluster) > 2:
            logging.error("Unable to parse cluster id: %s" % cluster_unique)
            logging.error(
                "Going to just try using entry %s, with cluster type %s"
                % (split_cluster[0], split_cluster[1]))

        return (split_cluster[0], split_cluster[1])

    def ClusterMeetPreferences(self):
        """
        Check the cluster against the configured idle / queued / held limits.

        @return: ( idleslots, idlejobs ) when every limit is satisfied
        @raise ClusterPreferenceException: when a count could not be
            obtained (None) or has reached its configured limit
        """
        # Check for idle glideins
        idleslots = self.status.GetIdleGlideins()
        if idleslots is None:
            logging.info(
                "Received None from idle glideins, going to try later")
            raise ClusterPreferenceException(
                "Received None from idle glideins")
        logging.debug("Idle glideins = %i" % idleslots)
        if idleslots >= int(self.get_option("MAXIDLEGLIDEINS", "5")):
            logging.info("Too many idle glideins")
            raise ClusterPreferenceException("Too many idle glideins")

        # Check for idle glidein jobs
        idlejobs = self.status.GetIdleGlideinJobs()
        if idlejobs is None:
            logging.info(
                "Received None from idle glidein jobs, going to try later")
            raise ClusterPreferenceException(
                "Received None from idle glidein jobs")
        logging.debug("Queued jobs = %i" % idlejobs)
        if idlejobs >= int(self.get_option("maxqueuedjobs", "5")):
            logging.info("Too many queued jobs")
            raise ClusterPreferenceException("Too many queued jobs")

        # Check for held jobs
        heldjobs = self.status.GetHeldGlideins()
        if heldjobs is None:
            logging.info(
                "Received None from held glidein jobs, going to try later")
            raise ClusterPreferenceException(
                "Received None from held glidein jobs")
        logging.debug("Held jobs = %i" % heldjobs)
        if heldjobs >= int(self.get_option("maxheldjobs", "5")):
            logging.info("Too many held jobs for cluster %s"
                         % self.cluster_unique)
            raise ClusterPreferenceException("Too many held jobs")

        return (idleslots, idlejobs)

    def GetIdleJobs(self):
        """
        Work out how many idle user jobs the offline ads report for this
        cluster.

        @return: int - 0 when offline ads are not in use or report nothing;
            otherwise the delinquent count for this cluster, but at least 5
        """
        if not self.useOffline:
            return 0

        # Update the offline cluster information
        toSubmit = self.offline.Update([self.cluster_unique])

        # Get the delinquent sites
        num_submit = self.offline.GetDelinquentSites([self.cluster_unique])
        logging.debug("toSubmit from offline %s", str(toSubmit))
        logging.debug("num_submit = %s\n", str(num_submit))

        if (len(toSubmit) > 0) or num_submit[self.cluster_unique]:
            idleuserjobs = max([num_submit[self.cluster_unique], 5])
            logging.debug(
                "Offline ads detected jobs should be submitted.  Idle user jobs set to %i",
                idleuserjobs)
        else:
            logging.debug(
                "Offline ads did not detect any matches or Delinquencies.")
            idleuserjobs = 0

        # Return the computed count.  The Update() result is a collection
        # (it is measured with len() above), not a job count.
        return idleuserjobs

    def SubmitGlideins(self, numSubmit):
        """
        Submit numSubmit glideins.

        @param numSubmit: The number of glideins to submit.  Zero or a
            negative number submits nothing.
        """
        # The template that every glidein is submitted from
        filename = os.path.join(self.get_option("GLIDEIN_DIRECTORY"),
                                "job.submit.template")

        # Submit jobs
        for _ in range(numSubmit):
            self.SingleSubmit(filename)

    def _ParseCustomOptions(self, custom_options_raw):
        """
        Parse a "key = value; key = value" string into a dictionary.

        Empty entries (for example from a trailing ";") are ignored, only
        the first "=" of an entry separates key from value, and entries
        without any "=" are logged and skipped.

        @param custom_options_raw: The raw option string, or None
        @return: dict mapping stripped keys to stripped values
        """
        custom_options = {}
        if custom_options_raw is None:
            return custom_options

        for option in custom_options_raw.split(";"):
            if not option.strip():
                continue
            (lside, separator, rside) = option.partition("=")
            if not separator:
                logging.error(
                    "Ignoring malformed custom_condor_submit entry: %s",
                    option.strip())
                continue
            custom_options[lside.strip()] = rside.strip()

        return custom_options

    def SingleSubmit(self, filename):
        """
        Submit a single glidein job

        @param filename: The file (string) to submit
        """
        # Get the cluster specific information
        # First, the cluster tmp directory
        cluster_tmp = self.get_option("worker_tmp", "/tmp")
        remote_factory_location = self.get_option("remote_factory",
                                                  "~/bosco/campus_factory")

        # If we are submitting to ourselves, then don't need remote cluster
        if self.get_option("CONDOR_HOST") == self.cluster_unique:
            remote_cluster = ""
        else:
            remote_cluster = self.cluster_entry

        # Get any custom attributes that are defined in the configuration
        custom_options = self._ParseCustomOptions(
            self.get_option("custom_condor_submit"))

        # TODO: These options should be moved to a better location
        predetermined_options = {
            "WN_TMP": cluster_tmp,
            "GLIDEIN_HOST": self.get_option("COLLECTOR_HOST"),
            "GLIDEIN_Site": self.cluster_unique,
            "BOSCOCluster": self.cluster_unique,
            "REMOTE_FACTORY": remote_factory_location,
            "REMOTE_CLUSTER": remote_cluster,
            "REMOTE_SCHEDULER": self.cluster_type,
            "GLIDEIN_DIR": self.get_option("GLIDEIN_DIRECTORY"),
            "PASSWDFILE_LOCATION": self.get_option("SEC_PASSWORD_FILE"),
        }

        # Combine the custom options with the pre-determined options.  Prefer
        # custom options over pre-determined.  Copy-then-update works whether
        # items() returns a list or a view.
        options = dict(predetermined_options)
        options.update(custom_options)

        # One ' -a key="value"' argument per option
        options_str = "".join(
            " -a %s=\"%s\"" % (key, value) for key, value in options.items())

        (stdout, stderr) = RunExternal("condor_submit %s %s"
                                       % (filename, options_str))
        logging.debug("stdout: %s" % stdout)
        logging.debug("stderr: %s" % stderr)