def spawn(sleep_time, advertize_rate, startup_dir, glideinDescript, frontendDescript, entries, restart_attempts, restart_interval):
    """Spawn and babysit one glideFactoryEntry.py child process per entry.

    Starts every entry as a separate child process, puts the children's
    stdout/stderr into non-blocking mode, then loops forever:
      - drops the old factory public key once its grace time expires
      - periodically removes aged credential files (if RemoveOldCredFreq > 0)
      - reads and processes the frontend globals classads
      - relays child stdout/stderr into the factory log
      - restarts children that exited, aborting the whole factory if one
        crashes more than restart_attempts times within restart_interval
      - aggregates monitoring data and advertises the global classad

    This function only returns by raising; on the way out (signal or error)
    the finally clause kills the children and deadvertizes the factory.

    Args:
        sleep_time: seconds to sleep between iterations (also passed to entries)
        advertize_rate: advertise rate, forwarded to each entry process
        startup_dir: factory startup directory, forwarded to each entry process
        glideinDescript: factory configuration object (dict-like .data)
        frontendDescript: frontend configuration object (known usernames)
        entries: list of entry names to spawn
        restart_attempts: max restarts within restart_interval before quitting
        restart_interval: time window (seconds) used by is_crashing_often

    Raises:
        RuntimeError: when an entry crashes too often and the factory must quit.
    """
    global STARTUP_DIR
    childs = {}
    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime
    childs_uptime = {}
    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    logSupport.log.info("Starting entries %s" % entries)
    try:
        for entry_name in entries:
            childs[entry_name] = popen2.Popen3("%s %s %s %s %s %s %s" % (sys.executable,
                                                                         os.path.join(STARTUP_DIR, "glideFactoryEntry.py"),
                                                                         os.getpid(), sleep_time, advertize_rate,
                                                                         startup_dir, entry_name), True)
            # Get the startup time. Used to check if the entry is crashing
            # periodically and needs to be restarted.
            childs_uptime[entry_name] = list()
            childs_uptime[entry_name].insert(0, time.time())
        logSupport.log.info("Entry startup times: %s" % childs_uptime)

        for entry_name in childs.keys():
            # nothing is ever written to the children's stdin
            childs[entry_name].tochild.close()
            # set it in non blocking mode
            # since we will run for a long time, we do not want to block
            for fd in (childs[entry_name].fromchild.fileno(), childs[entry_name].childerr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

        # Check if freq is greater than zero. If negative, do not do credential cleanup.
        if int(glideinDescript.data['RemoveOldCredFreq']) > 0:
            # Convert credential removal frequency from hours to seconds
            # (BUGFIX: was "* 60", a debug leftover that made configured hours act as minutes)
            remove_old_cred_freq = int(glideinDescript.data['RemoveOldCredFreq']) * 60 * 60
            curr_time = time.time()
            update_time = curr_time + remove_old_cred_freq

            # Convert credential removal age from days to seconds
            # (BUGFIX: was "* 60", a debug leftover that made configured days act as minutes)
            remove_old_cred_age = int(glideinDescript.data['RemoveOldCredAge']) * 24 * 60 * 60

            # Create cleaners for old credential files
            logSupport.log.info("Adding cleaners for old credentials")
            cred_base_dir = glideinDescript.data['ClientProxiesBaseDir']
            for username in frontendDescript.get_all_usernames():
                cred_base_user = os.path.join(cred_base_dir, "user_%s" % username)
                cred_user_instance_dirname = os.path.join(cred_base_user,
                                                          "glidein_%s" % glideinDescript.data['GlideinName'])
                cred_cleaner = cleanupSupport.PrivsepDirCleanupCredentials(username, cred_user_instance_dirname,
                                                                           "(credential_*)", remove_old_cred_age)
                cleanupSupport.cred_cleaners.add_cleaner(cred_cleaner)

        while 1:
            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace is up.
            # If a compromised key is left around and if attacker can somehow
            # trigger FactoryEntry process crash, we do not want the entry to pick up
            # the old key again when factory auto restarts it.
            if (time.time() > oldkey_eoltime) and (glideinDescript.data['OldPubKeyObj'] is not None):
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after its grace time of %s seconds" % oldkey_gracetime)
                except:
                    # Do not crash if delete fails. Just log it.
                    logSupport.log.warning("Failed to remove the old public key after its grace time")

            # Only removing credentials in the v3+ protocol
            # This is because it mainly matters for Corral Frontends, which only support the v3+ protocol.
            # IF freq < zero, do not do cleanup.
            if int(glideinDescript.data['RemoveOldCredFreq']) > 0 and curr_time >= update_time:
                logSupport.log.info("Checking credentials for cleanup")
                # Query queue for glideins. We don't want to remove proxies
                # that are currently in use.
                try:
                    in_use_creds = glideFactoryLib.getCondorQCredentialList()
                    cleanupSupport.cred_cleaners.cleanup(in_use_creds)
                except:
                    logSupport.log.exception("Unable to cleanup old credentials")
                update_time = curr_time + remove_old_cred_freq
            curr_time = time.time()

            logSupport.log.info("Checking for credentials %s" % entries)
            # read in the frontend globals classad
            # Do this first so that the credentials are immediately available when the Entries startup
            # BUGFIX: initialize classads so a failure in get_globals_classads()
            # on the first iteration logs and continues instead of raising NameError below.
            classads = {}
            try:
                classads = glideFactoryCredentials.get_globals_classads()
            except Exception:
                logSupport.log.exception("Error occurred processing globals classads: ")
            for classad_key in classads.keys():
                classad = classads[classad_key]
                try:
                    glideFactoryCredentials.process_global(classad, glideinDescript, frontendDescript)
                except:
                    logSupport.log.exception("Error occurred processing the globals classads: ")

            logSupport.log.info("Checking entries %s" % entries)
            for entry_name in childs.keys():
                child = childs[entry_name]

                # empty stdout and stderr
                try:
                    tempOut = child.fromchild.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("Child %s STDOUT: %s" % (entry_name, tempOut))
                except IOError:
                    pass  # ignore
                try:
                    tempErr = child.childerr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("Child %s STDERR: %s" % (entry_name, tempErr))
                except IOError:
                    pass  # ignore

                # look for exited child (popen2 poll() returns -1 while still running)
                if child.poll() != -1:
                    # the child exited
                    logSupport.log.warning("Child %s exited. Checking if it should be restarted." % (entry_name))
                    tempOut = child.fromchild.readlines()
                    tempErr = child.childerr.readlines()
                    if is_crashing_often(childs_uptime[entry_name], restart_interval, restart_attempts):
                        del childs[entry_name]
                        raise RuntimeError("Entry '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (entry_name, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting child %s." % (entry_name))
                        del childs[entry_name]
                        childs[entry_name] = popen2.Popen3("%s %s %s %s %s %s %s" % (sys.executable,
                                                                                     os.path.join(STARTUP_DIR, "glideFactoryEntry.py"),
                                                                                     os.getpid(), sleep_time, advertize_rate,
                                                                                     startup_dir, entry_name), True)
                        # keep only the last restart_attempts timestamps for is_crashing_often
                        if len(childs_uptime[entry_name]) == restart_attempts:
                            childs_uptime[entry_name].pop(0)
                        childs_uptime[entry_name].append(time.time())
                        childs[entry_name].tochild.close()
                        for fd in (childs[entry_name].fromchild.fileno(), childs[entry_name].childerr.fileno()):
                            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
                        logSupport.log.warning("Entry startup/restart times: %s" % childs_uptime)

            logSupport.log.info("Aggregate monitoring data")
            aggregate_stats(factory_downtimes.checkDowntime())

            # Advertise the global classad with the factory keys
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(glideinDescript.data['FactoryName'],
                                                      glideinDescript.data['GlideinName'],
                                                      glideFactoryLib.factoryConfig.supported_signtypes,
                                                      glideinDescript.data['PubKeyObj'])
            except Exception as e:
                logSupport.log.warning("Error occurred while trying to advertize global.\nError is: %s" % str(e))

            # do it just before the sleep - commenting out - I think that only logs are cleaned up here
            cleanupSupport.cleaners.cleanup()

            logSupport.log.info("Sleep %s secs" % sleep_time)
            time.sleep(sleep_time)
    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # if anything goes wrong, hardkill the rest
                for entry_name in childs.keys():
                    logSupport.log.info("Hard killing entry %s" % entry_name)
                    try:
                        os.kill(childs[entry_name].pid, signal.SIGKILL)
                    except OSError:
                        pass  # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(glideinDescript.data['FactoryName'],
                                                         glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.exception("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(glideinDescript.data['FactoryName'],
                                                                         glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.exception("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All entries should be terminated")
def spawn(sleep_time, advertize_rate, startup_dir, glideinDescript, frontendDescript, entries, restart_attempts, restart_interval):
    """Spawn and babysit one glideFactoryEntry.py child process per entry.

    NOTE(review): this is the SECOND top-level definition of spawn() in this
    module; being later, it shadows the earlier definition (which additionally
    sets up credential cleanup and uses get_globals_classads/process_global).
    The duplication looks like a merge artifact and should be resolved by
    keeping exactly one of the two versions -- confirm which with the team.

    Starts every entry as a separate child process, puts the children's
    stdout/stderr into non-blocking mode, then loops forever:
      - drops the old factory public key once its grace time expires
      - processes the frontend globals classads (process_globals)
      - relays child stdout/stderr into the factory log
      - restarts children that exited, aborting the whole factory if one
        crashes more than restart_attempts times within restart_interval
      - aggregates monitoring data and advertises the global classad

    Only returns by raising; on the way out (signal or error) the finally
    clause kills the children and deadvertizes the factory classads.

    Args:
        sleep_time: seconds to sleep between iterations (also passed to entries)
        advertize_rate: advertise rate, forwarded to each entry process
        startup_dir: factory startup directory, forwarded to each entry process
        glideinDescript: factory configuration object (dict-like .data)
        frontendDescript: frontend configuration object
        entries: list of entry names to spawn
        restart_attempts: max restarts within restart_interval before quitting
        restart_interval: time window (seconds) used by is_crashing_often

    Raises:
        RuntimeError: when an entry crashes too often and the factory must quit.
    """
    global STARTUP_DIR
    childs = {}
    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime
    childs_uptime = {}
    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    logSupport.log.info("Starting entries %s" % entries)
    try:
        for entry_name in entries:
            childs[entry_name] = popen2.Popen3("%s %s %s %s %s %s %s" % (sys.executable,
                                                                         os.path.join(STARTUP_DIR, "glideFactoryEntry.py"),
                                                                         os.getpid(), sleep_time, advertize_rate,
                                                                         startup_dir, entry_name), True)
            # Get the startup time. Used to check if the entry is crashing
            # periodically and needs to be restarted.
            childs_uptime[entry_name] = list()
            childs_uptime[entry_name].insert(0, time.time())
        logSupport.log.info("Entry startup times: %s" % childs_uptime)

        for entry_name in childs.keys():
            # nothing is ever written to the children's stdin
            childs[entry_name].tochild.close()
            # set it in non blocking mode
            # since we will run for a long time, we do not want to block
            for fd in (childs[entry_name].fromchild.fileno(), childs[entry_name].childerr.fileno()):
                fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

        while 1:
            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace is up.
            # If a compromised key is left around and if attacker can somehow
            # trigger FactoryEntry process crash, we do not want the entry to pick up
            # the old key again when factory auto restarts it.
            if (time.time() > oldkey_eoltime) and (glideinDescript.data['OldPubKeyObj'] is not None):
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after it's grace time of %s seconds" % oldkey_gracetime)
                except:
                    # Do not crash if delete fails. Just log it.
                    # BUGFIX: this log statement was duplicated (emitted twice); deduplicated.
                    logSupport.log.info("Failed to remove the old public key after it's grace time")

            logSupport.log.info("Checking for credentials %s" % entries)
            try:
                # read in the frontend globals classad
                # Do this first so that the credentials are immediately available when the Entries startup
                glideFactoryCredentials.process_globals(glideinDescript, frontendDescript)
            except:
                tb = traceback.format_exception(sys.exc_info()[0], sys.exc_info()[1], sys.exc_info()[2])
                error_str = "Error occurred processing the globals classads. \nTraceback: \n%s" % tb
                logSupport.log.warning(error_str)

            logSupport.log.info("Checking entries %s" % entries)
            for entry_name in childs.keys():
                child = childs[entry_name]

                # empty stdout and stderr
                try:
                    tempOut = child.fromchild.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("Child %s STDOUT: %s" % (entry_name, tempOut))
                except IOError:
                    pass  # ignore
                try:
                    tempErr = child.childerr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("Child %s STDERR: %s" % (entry_name, tempErr))
                except IOError:
                    pass  # ignore

                # look for exited child (popen2 poll() returns -1 while still running)
                if child.poll() != -1:
                    # the child exited
                    logSupport.log.warning("Child %s exited. Checking if it should be restarted." % (entry_name))
                    tempOut = child.fromchild.readlines()
                    tempErr = child.childerr.readlines()
                    if is_crashing_often(childs_uptime[entry_name], restart_interval, restart_attempts):
                        del childs[entry_name]
                        raise RuntimeError("Entry '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (entry_name, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting child %s." % (entry_name))
                        del childs[entry_name]
                        childs[entry_name] = popen2.Popen3("%s %s %s %s %s %s %s" % (sys.executable,
                                                                                     os.path.join(STARTUP_DIR, "glideFactoryEntry.py"),
                                                                                     os.getpid(), sleep_time, advertize_rate,
                                                                                     startup_dir, entry_name), True)
                        # keep only the last restart_attempts timestamps for is_crashing_often
                        if len(childs_uptime[entry_name]) == restart_attempts:
                            childs_uptime[entry_name].pop(0)
                        childs_uptime[entry_name].append(time.time())
                        childs[entry_name].tochild.close()
                        for fd in (childs[entry_name].fromchild.fileno(), childs[entry_name].childerr.fileno()):
                            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
                            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
                        logSupport.log.warning("Entry startup/restart times: %s" % childs_uptime)

            logSupport.log.info("Aggregate monitoring data")
            aggregate_stats(factory_downtimes.checkDowntime())

            # Advertise the global classad with the factory keys
            try:
                # KEL TODO need to add factory downtime?????
                glideFactoryInterface.advertizeGlobal(glideinDescript.data['FactoryName'],
                                                      glideinDescript.data['GlideinName'],
                                                      glideFactoryLib.factoryConfig.supported_signtypes,
                                                      glideinDescript.data['PubKeyObj'])
            except Exception as e:
                logSupport.log.warning("Error occurred while trying to advertize global.\nError is: %s" % str(e))

            # do it just before the sleep - commenting out - I think that only logs are cleaned up here
            cleanupSupport.cleaners.cleanup()

            logSupport.log.info("Sleep %s secs" % sleep_time)
            time.sleep(sleep_time)
    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # if anything goes wrong, hardkill the rest
                for entry_name in childs.keys():
                    logSupport.log.info("Hard killing entry %s" % entry_name)
                    try:
                        os.kill(childs[entry_name].pid, signal.SIGKILL)
                    except OSError:
                        pass  # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(glideinDescript.data['FactoryName'],
                                                         glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.warning("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(glideinDescript.data['FactoryName'],
                                                                         glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.warning("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All entries should be terminated")