def GetNumSubmit(self, idleslots, idlejobs, idleuserjobs):
    """
    Calculate the number of glideins to submit.

    @param idleslots: Number of idle startd's
    @param idlejobs: Number of glideins in queue, but not active
    @param idleuserjobs: Number of idle user jobs from FLOCK_FROM
    @return: int - Number of glideins to submit
    """
    # If we have already submitted enough glideins to fulfill the request,
    # don't submit more.  (max/min take plain arguments; no list needed.)
    if max(idlejobs, idleslots) >= idleuserjobs:
        logging.debug("The number of idlejobs or idleslots fufills the requested idleuserjobs, not submitting any glideins")
        return 0

    status = ClusterStatus(status_constraint="IsUndefined(Offline)")

    # Check that running glideins are reporting to the collector
    running_glidein_jobs = status.GetRunningGlideinJobs()
    logging.debug("Number of running_glidein_jobs = %i", running_glidein_jobs)
    running_glideins = status.GetRunningGlideins()
    logging.debug("Number of running glideins = %i", running_glideins)

    # If fewer than 90% of the running glidein jobs show up as glideins in
    # the collector, assume reporting is broken and hold off submitting.
    if (running_glidein_jobs * .9) > running_glideins:
        logging.error("I'm guessing glideins are not reporting to the collector, not submitting")
        return 0

    # Ok, so now submit until we can't submit any more, or there are less user jobs
    return min(int(get_option("maxqueuedjobs")) - idlejobs,
               idleuserjobs,
               int(get_option("MaxIdleGlideins")) - idleslots)
def _DropPriv(self):
    """
    Drop the process privileges to the configured factory user.

    Resolution order: the factory_user config option, then CONDOR_IDS
    (formatted "<uid>.<gid>"); exits when running as root with no way to
    determine the target user.  Sets HOME and USER before switching ids.
    """
    factory_user = get_option("factory_user")
    current_uid = os.getuid()
    if factory_user is None:
        logging.warning("factory_user is not set in campus factory config file")
        if get_option("CONDOR_IDS"):
            logging.info("CONDOR_IDS is set, will use for dropping privledge")
            # CONDOR_IDS is "<uid>.<gid>"; split into the two numeric parts.
            (factory_uid, factory_gid) = get_option("CONDOR_IDS").split(".")
            factory_uid = int(factory_uid)
            factory_gid = int(factory_gid)
            factory_user = pwd.getpwuid(factory_uid).pw_name
        elif current_uid == 0:
            # Running as root with no target user: refuse to continue.
            logging.error("We are running as root, which can not submit condor jobs.")
            logging.error("Don't know who to drop privledges to.")
            logging.error("I can't do my job!")
            logging.error("Exiting...")
            sys.exit(1)
        # NOTE(review): if factory_user and CONDOR_IDS are both unset and we
        # are NOT root, neither branch above runs and factory_uid/factory_gid
        # stay unbound, so the logging.debug below raises NameError — confirm
        # whether callers guarantee this combination cannot occur.
    else:
        # If factory user is set
        factory_uid = pwd.getpwnam(factory_user).pw_uid
        factory_gid = pwd.getpwnam(factory_user).pw_gid

    logging.debug("Using %i:%i for user:group" % (factory_uid, factory_gid))
    # Some parts of bosco need the HOME directory and USER to be defined
    os.environ["HOME"] = pwd.getpwnam(factory_user).pw_dir
    os.environ["USER"] = factory_user
    # Group must be dropped before the uid; after setuid we would no longer
    # have permission to call setgid.
    os.setgid(factory_gid)
    os.setuid(factory_uid)
def _SetLogging(self):
    """
    Set the logging level and configure the logging handlers.

    Attaches a rotating file handler (10 MB x 5 backups) to the root logger
    and redirects stdout/stderr into the log via StreamToLogger.
    """
    logging_levels = {'debug': logging.DEBUG,
                      'info': logging.INFO,
                      'warning': logging.WARNING,
                      'error': logging.ERROR,
                      'critical': logging.CRITICAL}
    # Fall back to INFO when loglevel is unset or misspelled in the config;
    # previously .get() returned None and root_logger.setLevel(None) raised.
    level = logging_levels.get(get_option("loglevel"), logging.INFO)
    logdirectory = get_option("logdirectory")
    handler = logging.handlers.RotatingFileHandler(
        os.path.join(logdirectory, "campus_factory.log"),
        maxBytes=10000000, backupCount=5)

    root_logger = logging.getLogger()
    root_logger.setLevel(level)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

    # Send stdout to the log
    stdout_logger = logging.getLogger()
    sl = StreamToLogger(stdout_logger, logging.INFO)
    sys.stdout = sl

    stderr_logger = logging.getLogger()
    sl = StreamToLogger(stderr_logger, logging.ERROR)
    sys.stderr = sl
def GetIdleJobs(self, status):
    """
    Get the number of idle jobs from configured flock from hosts.

    @param status: status object used to query the schedds for idle jobs
    @return: { user: int } - Number of idle jobs by user (dictionary), or
             None when the query failed and should be retried later.
    """
    # Check for idle jobs to flock from
    if not self.UseOffline:
        schedds = []
        # Get schedd's to query
        if get_option("FLOCK_FROM"):
            schedds = get_option("FLOCK_FROM").strip().split(",")
        logging.debug("Schedds to query: %s" % str(schedds))
        idleuserjobs = status.GetIdleJobs(schedds)
        # "is None" instead of "== None"; None means "retry later".
        if idleuserjobs is None:
            logging.info("Received None from idle user jobs, going to try later")
            return None

        # Add all the idle jobs from all the schedds, unique on user (owner)
        # (dict.get replaces the Python-2-only has_key idiom).
        user_idle = {}
        for schedd_jobs in idleuserjobs.values():
            for user, count in schedd_jobs.items():
                user_idle[user] = user_idle.get(user, 0) + count
        return user_idle
def _DropPriv(self):
    """
    Drop the process privileges to the configured factory user.

    Resolution order: the factory_user config option, then CONDOR_IDS
    (formatted "<uid>.<gid>"); exits when the target user can not be
    determined.  Sets HOME and USER for bosco before switching ids.
    """
    factory_user = get_option("factory_user")
    current_uid = os.getuid()
    if factory_user is None:
        logging.warning(
            "factory_user is not set in campus factory config file")
        if get_option("CONDOR_IDS"):
            logging.info(
                "CONDOR_IDS is set, will use for dropping privledge")
            # CONDOR_IDS is "<uid>.<gid>".
            (factory_uid, factory_gid) = get_option("CONDOR_IDS").split(".")
            factory_uid = int(factory_uid)
            factory_gid = int(factory_gid)
            factory_user = pwd.getpwuid(factory_uid).pw_name
        elif current_uid == 0:
            logging.error(
                "We are running as root, which can not submit condor jobs."
            )
            logging.error("Don't know who to drop privledges to.")
            logging.error("I can't do my job!")
            logging.error("Exiting...")
            sys.exit(1)
        else:
            # Neither factory_user nor CONDOR_IDS is set and we are not root.
            # Previously this fell through with factory_uid/factory_gid
            # unbound and crashed with a NameError below; exit cleanly.
            logging.error("Cannot determine user to drop privileges to: "
                          "factory_user and CONDOR_IDS are both unset.")
            logging.error("Exiting...")
            sys.exit(1)
    else:
        # If factory user is set
        factory_uid = pwd.getpwnam(factory_user).pw_uid
        factory_gid = pwd.getpwnam(factory_user).pw_gid

    logging.debug("Using %i:%i for user:group" % (factory_uid, factory_gid))
    # Some parts of bosco need the HOME directory and USER to be defined
    os.environ["HOME"] = pwd.getpwnam(factory_user).pw_dir
    os.environ["USER"] = factory_user
    # Drop the group first; after setuid we could no longer setgid.
    os.setgid(factory_gid)
    os.setuid(factory_uid)
def _SetLogging(self):
    """
    Set the logging level and configure the logging handlers.

    Attaches a rotating file handler to the root logger and redirects
    stdout/stderr into the log via StreamToLogger.
    """
    # Map the config's loglevel string onto the logging module's constants.
    name_to_level = {
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }
    chosen_level = name_to_level.get(get_option("loglevel"))

    # One rotating log file: 10 MB per file, five backups kept.
    log_path = os.path.join(get_option("logdirectory"), "campus_factory.log")
    file_handler = logging.handlers.RotatingFileHandler(log_path,
                                                        maxBytes=10000000,
                                                        backupCount=5)
    file_handler.setFormatter(
        logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))

    root = logging.getLogger()
    root.setLevel(chosen_level)
    root.addHandler(file_handler)

    # Anything printed to stdout/stderr should end up in the log as well.
    sys.stdout = StreamToLogger(logging.getLogger(), logging.INFO)
    sys.stderr = StreamToLogger(logging.getLogger(), logging.ERROR)
def GetIdleJobs(self, status):
    """
    Get the number of idle jobs from configured flock from hosts.

    @param status: status object used to query the schedds for idle jobs
    @return: { user: int } - Number of idle jobs by user (dictionary), or
             None when the query failed and should be retried later.
    """
    # Check for idle jobs to flock from
    if not self.UseOffline:
        schedds = []
        # Get schedd's to query
        if get_option("FLOCK_FROM"):
            schedds = get_option("FLOCK_FROM").strip().split(",")
        # Add the local host to query
        schedds.append(get_option("CONDOR_HOST"))
        logging.debug("Schedds to query: %s" % str(schedds))
        idleuserjobs = status.GetIdleJobs(schedds)
        # "is None" instead of "== None"; None means "retry later".
        if idleuserjobs is None:
            logging.info("Received None from idle user jobs, going to try later")
            return None

        # Add all the idle jobs from all the schedds, unique on user (owner)
        # (dict.get replaces the Python-2-only has_key idiom).
        user_idle = {}
        for schedd_jobs in idleuserjobs.values():
            for user, count in schedd_jobs.items():
                user_idle[user] = user_idle.get(user, 0) + count
        return user_idle
def _GetClusterSpecificConfig(self, option, default):
    """
    Look up a configuration option, preferring the cluster-specific section.

    @param option: name of the configuration option to read
    @param default: value to return when the option is set nowhere
    @return: the cluster-section value if truthy, else the global value if
             truthy, else the supplied default
    """
    # Hoist each lookup into a local so the winning source is only queried
    # once (the original called get_option_section/get_option twice each).
    cluster_value = get_option_section(self.cluster_unique, option)
    if cluster_value:
        return cluster_value
    global_value = get_option(option)
    if global_value:
        return global_value
    return default
def Intialize(self, signum=None, frame=None):
    """
    Initialize the factory's variables such as configuration and logging.

    Also installed as the SIGHUP handler so the factory re-reads its
    configuration on SIGHUP.  signum/frame are accepted (and ignored) for
    that reason and default to None for direct calls — the original
    single-argument signature made the handler raise TypeError when the
    signal actually fired.
    """
    # Set the sighup signal handler
    signal.signal(signal.SIGHUP, self.Intialize)

    # Read in the configuration file
    self.config_file = self.options.config
    files_read = set_config_file(self.config_file)
    # check if no files read in
    if len(files_read) < 1:
        sys.stderr.write("No configuration files found. Location = %s\n" %
                         self.config_file)
        sys.exit(1)

    self._SetLogging()

    if os.getuid() == 0 or get_option("factory_user"):
        logging.info("Detected that factory should change user")
        self._DropPriv()

    self.UseOffline = get_option("useoffline", "false").lower() == "true"

    self.cluster_list = []
    # Get the cluster lists ("!=" replaces the buggy identity test
    # `is not ""`, which compared object identity, not content).
    if get_option("clusterlist", "") != "":
        logging.debug(
            "Using the cluster list in the campus factory configuration.")
        for cluster_id in get_option("clusterlist").split(','):
            self.cluster_list.append(
                Cluster(cluster_id, useOffline=self.UseOffline))
    else:
        # Check for the bosco cluster command
        (stdout, stderr) = RunExternal("bosco_cluster -l")
        # The original used `stdout is not "No clusters configured"`, an
        # identity check that is always True; compare the stripped value.
        if len(stdout) != 0 and stdout.strip() != "No clusters configured":
            logging.debug("Using the cluster list installed with BOSCO")
            for cluster_id in stdout.split("\n"):
                if len(cluster_id) > 0 and cluster_id != "":
                    self.cluster_list.append(
                        Cluster(cluster_id, useOffline=self.UseOffline))
        else:
            # Initialize as empty, which infers to submit 'here'
            self.cluster_list = [
                Cluster(get_option("CONDOR_HOST"),
                        useOffline=self.UseOffline)
            ]

    # Tar up the executables
    wrangler = DaemonWrangler()
    wrangler.Package()
def Intialize(self, signum=None, frame=None):
    """
    Initialize the factory's variables such as configuration and logging.

    Also installed as the SIGHUP handler so the factory re-reads its
    configuration on SIGHUP.  signum/frame are accepted (and ignored) for
    that reason and default to None for direct calls — the original
    single-argument signature made the handler raise TypeError when the
    signal actually fired.
    """
    # Set the sighup signal handler
    signal.signal(signal.SIGHUP, self.Intialize)

    # Read in the configuration file
    self.config_file = self.options.config
    files_read = set_config_file(self.config_file)
    # check if no files read in
    if len(files_read) < 1:
        sys.stderr.write("No configuration files found. Location = %s\n" %
                         self.config_file)
        sys.exit(1)

    self._SetLogging()

    if os.getuid() == 0 or get_option("factory_user"):
        logging.info("Detected that factory should change user")
        self._DropPriv()

    self.UseOffline = get_option("useoffline", "false").lower() == "true"

    self.cluster_list = []
    # Get the cluster lists ("!=" replaces the buggy identity test
    # `is not ""`, which compared object identity, not content).
    if get_option("clusterlist", "") != "":
        logging.debug("Using the cluster list in the campus factory configuration.")
        for cluster_id in get_option("clusterlist").split(','):
            self.cluster_list.append(Cluster(cluster_id, useOffline=self.UseOffline))
    else:
        # Check for the bosco cluster command
        (stdout, stderr) = RunExternal("bosco_cluster -l")
        # The original used `stdout is not "No clusters configured"`, an
        # identity check that is always True; compare the stripped value.
        if len(stdout) != 0 and stdout.strip() != "No clusters configured":
            logging.debug("Using the cluster list installed with BOSCO")
            for cluster_id in stdout.split("\n"):
                if len(cluster_id) > 0 and cluster_id != "":
                    self.cluster_list.append(Cluster(cluster_id, useOffline=self.UseOffline))
        else:
            # Initialize as empty, which infers to submit 'here'
            self.cluster_list = [Cluster(get_option("CONDOR_HOST"), useOffline=self.UseOffline)]

    # Tar up the executables
    wrangler = DaemonWrangler()
    wrangler.Package()
def ClusterMeetPreferences(self):
    """
    Check whether this cluster should receive more glideins.

    @return: (idleslots, idlejobs) tuple when the limits are not exceeded
    @raise ClusterPreferenceException: when status is unavailable or the
           idle-glidein / queued-job limits have been reached
    """
    idleslots = self.status.GetIdleGlideins()
    # "is None" instead of "== None"; None means the query failed.
    if idleslots is None:
        logging.info("Received None from idle glideins, going to try later")
        raise ClusterPreferenceException("Received None from idle glideins")
    logging.debug("Idle glideins = %i" % idleslots)
    if idleslots >= int(get_option("MAXIDLEGLIDEINS", "5")):
        logging.info("Too many idle glideins")
        raise ClusterPreferenceException("Too many idle glideins")

    # Check for idle glidein jobs
    idlejobs = self.status.GetIdleGlideinJobs()
    if idlejobs is None:
        logging.info("Received None from idle glidein jobs, going to try later")
        raise ClusterPreferenceException("Received None from idle glidein jobs")
    logging.debug("Queued jobs = %i" % idlejobs)
    if idlejobs >= int(get_option("maxqueuedjobs", "5")):
        logging.info("Too many queued jobs")
        raise ClusterPreferenceException("Too many queued jobs")

    return (idleslots, idlejobs)
def SubmitGlideins(self, numSubmit):
    """
    Submit numSubmit glideins.

    @param numSubmit: The number of glideins to submit.
    """
    # The glidein job description template lives in the glidein directory.
    template_path = os.path.join(get_option("GLIDEIN_DIRECTORY"),
                                 "job.submit.template")
    # One submission per requested glidein.
    for _ in range(numSubmit):
        self.SingleSubmit(template_path)
def SingleSubmit(self, filename):
    """
    Submit a single glidein job.

    @param filename: The file (string) to submit
    """
    # Gather the cluster specific information: the worker tmp directory and
    # where the factory lives on the remote side.
    cluster_tmp = self._GetClusterSpecificConfig("worker_tmp", "/tmp")
    remote_factory_location = self._GetClusterSpecificConfig(
        "remote_factory", "~/bosco/campus_factory")

    # When submitting to ourselves there is no remote cluster to name.
    remote_cluster = ("" if get_option("CONDOR_HOST") == self.cluster_unique
                      else self.cluster_entry)

    # TODO: These options should be moved to a better location
    options = {
        "WN_TMP": cluster_tmp,
        "GLIDEIN_HOST": get_option("COLLECTOR_HOST"),
        "GLIDEIN_Site": self.cluster_unique,
        "BOSCOCluster": self.cluster_unique,
        "REMOTE_FACTORY": remote_factory_location,
        "REMOTE_CLUSTER": remote_cluster,
        "REMOTE_SCHEDULER": self.cluster_type,
        "GLIDEIN_DIR": get_option("GLIDEIN_DIRECTORY"),
        "PASSWDFILE_LOCATION": get_option("SEC_PASSWORD_FILE"),
    }
    # Each option becomes a condor_submit "-a key=value" argument.
    options_str = "".join(" -a %s=\"%s\"" % (key, value)
                          for key, value in options.items())

    (stdout, stderr) = RunExternal("condor_submit %s %s" % (filename, options_str))
    logging.debug("stdout: %s" % stdout)
    logging.debug("stderr: %s" % stderr)
def __init__(self, daemons=None, base_condor_dir=None, dumb_package=False):
    """
    @param daemons: A list of daemons that will be included in the package
                    (defaults to DEFAULT_GLIDEIN_DAEMONS)
    @param base_condor_dir: stored as-is for later packaging use — exact
                            semantics defined by the callers
    @param dumb_package: flag stored as-is for later packaging decisions
    """
    if daemons is None:
        self.daemons = DEFAULT_GLIDEIN_DAEMONS
    else:
        self.daemons = daemons
    # Narrowed from a bare "except:", which also swallowed SystemExit and
    # KeyboardInterrupt; only real lookup failures fall back to "".
    try:
        self.glidein_dir = get_option("GLIDEIN_DIRECTORY")
    except Exception:
        self.glidein_dir = ""
    self.base_condor_dir = base_condor_dir
    self.dumb_package = dumb_package
def _CheckDaemons(self):
    """
    Make sure that the daemons that are supposed to be packaged are
    available and readable; return the full paths of those that are.
    """
    sbin_dir = get_option("SBIN")
    logging.debug("Found SBIN directory = %s" % sbin_dir)
    # Build each candidate path and keep only those passing the file check.
    candidates = (os.path.join(sbin_dir, daemon) for daemon in self.daemons)
    return [path for path in candidates if self._CheckFile(path)]
def __init__(self, daemons=None, base_condor_dir=None, dumb_package=False):
    """
    @param daemons: A list of daemons that will be included in the package
                    (defaults to DEFAULT_GLIDEIN_DAEMONS)
    @param base_condor_dir: stored as-is for later packaging use — exact
                            semantics defined by the callers
    @param dumb_package: flag stored as-is for later packaging decisions
    """
    if daemons is None:
        self.daemons = DEFAULT_GLIDEIN_DAEMONS
    else:
        self.daemons = daemons
    # Narrowed from a bare "except:", which also swallowed SystemExit and
    # KeyboardInterrupt; only real lookup failures fall back to "".
    try:
        self.glidein_dir = get_option("GLIDEIN_DIRECTORY")
    except Exception:
        self.glidein_dir = ""
    self.base_condor_dir = base_condor_dir
    self.dumb_package = dumb_package
def _GetDynamicLibraries(self, files, libdirs=['lib', 'lib/condor']):
    """
    Get the dynamic libraries that the files are using
    (Adapted from get_condor_dlls in glideinwms)

    @param files: files to check for dynamic libraries
    @param libdirs: condor-relative directories searched for condor-provided
                    replacements of system libraries
    @return: list of library paths, in the order they were processed
    """
    # Worklist algorithm: libstodo = libraries still to examine,
    # libsdone = libraries whose own dependencies were already expanded,
    # rlist = the result (also used as the "already emitted" check).
    libstodo = set()
    libsdone = set()
    rlist = []
    condor_dir = get_option("RELEASE_DIR")
    # First, get the initial libraries
    for file in files:
        libstodo.update(self._ldd(file))

    while len(libstodo) > 0:
        lib = libstodo.pop()
        # Already did library?
        if lib in rlist:
            continue
        if not lib.startswith(condor_dir):
            # Check if the library is provided by condor
            # If so, add the condor provided lib to process
            # Overriding the system's library (condor knows best?)
            libname = os.path.basename(lib)
            for libdir in libdirs:
                if os.path.exists(os.path.join(condor_dir, libdir, libname)):
                    new_lib = os.path.join(condor_dir, libdir, libname)
                    if new_lib not in rlist:
                        libstodo.add(new_lib)
                        libsdone.add(lib)
        else:
            # In the condor directory: recurse into its dependencies,
            # skipping any already expanded.
            new_libstodo = set(self._ldd(lib))
            libsdone.add(lib)
            libstodo.update(new_libstodo - libsdone)
        # NOTE(review): every popped lib is appended here, including system
        # libs for which a condor-provided replacement was queued above —
        # confirm that including both paths in the result is intended.
        rlist.append(lib)
    return rlist
def _GetDynamicLibraries(self, files, libdirs = ['lib', 'lib/condor']):
    """
    Get the dynamic libraries that the files are using
    (Adapted from get_condor_dlls in glideinwms)

    @param files: files to check for dynamic libraries
    @param libdirs: condor-relative directories searched for condor-provided
                    replacements of system libraries
    @return: list of library paths, in the order they were processed
    """
    # Worklist algorithm: libstodo = libraries still to examine,
    # libsdone = libraries whose own dependencies were already expanded,
    # rlist = the result (also used as the "already emitted" check).
    libstodo = set()
    libsdone = set()
    rlist = []
    condor_dir = get_option("RELEASE_DIR")
    # First, get the initial libraries
    for file in files:
        libstodo.update(self._ldd(file))

    while len(libstodo) > 0:
        lib = libstodo.pop()
        # Already did library?
        if lib in rlist:
            continue
        if not lib.startswith(condor_dir):
            # Check if the library is provided by condor
            # If so, add the condor provided lib to process
            # Overriding the system's library (condor knows best?)
            libname = os.path.basename(lib)
            for libdir in libdirs:
                if os.path.exists(os.path.join(condor_dir, libdir, libname)):
                    new_lib = os.path.join(condor_dir, libdir, libname)
                    if new_lib not in rlist:
                        libstodo.add(new_lib)
                        libsdone.add(lib)
        else:
            # In the condor directory: recurse into its dependencies,
            # skipping any already expanded.
            new_libstodo = set(self._ldd(lib))
            libsdone.add(lib)
            libstodo.update(new_libstodo - libsdone)
        # NOTE(review): every popped lib is appended here, including system
        # libs for which a condor-provided replacement was queued above —
        # confirm that including both paths in the result is intended.
        rlist.append(lib)
    return rlist
def SleepFactory(self):
    """Pause the factory between iterations for the configured interval."""
    seconds = int(get_option("iterationtime"))
    logging.info("Sleeping for %i seconds" % seconds)
    time.sleep(seconds)