Beispiel #1
0
def spawn(sleep_time, advertize_rate, startup_dir,
          glideinDescript, frontendDescript, entries, restart_attempts, restart_interval):
    """Start one glideFactoryEntry.py child per entry and supervise them forever.

    Each loop iteration: drops the old public key once its grace time is over,
    periodically cleans up old credentials, processes the frontend globals
    classads, drains and logs the children's stdout/stderr, restarts children
    that exited, aggregates monitoring data, advertises the factory global
    classad, runs the registered cleaners and finally sleeps.

    The loop only ends through an exception. On the way out the children are
    terminated (hard-killed if the clean exit fails) and the factory is
    deadvertised.

    Args:
        sleep_time: seconds to sleep between iterations; also passed to the
            entry processes on their command line.
        advertize_rate: passed through to the entry processes.
        startup_dir: passed through to the entry processes.
        glideinDescript: object exposing a ``data`` mapping (keys read here:
            OldPubKeyGraceTime, DowntimesFile, RemoveOldCredFreq,
            RemoveOldCredAge, ClientProxiesBaseDir, GlideinName, FactoryName,
            OldPubKeyObj, OldPubKeyType, PubKeyObj) and ``remove_old_key()``.
        frontendDescript: object exposing ``get_all_usernames()``; also handed
            to ``glideFactoryCredentials.process_global``.
        entries: iterable of entry names, one child process is started for each.
        restart_attempts: passed to ``is_crashing_often``; also the maximum
            number of start times remembered per entry.
        restart_interval: passed to ``is_crashing_often``.

    Raises:
        RuntimeError: when ``is_crashing_often`` reports that an exited entry
            has been crashing too often.
    """

    global STARTUP_DIR
    childs = {}

    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    # entry name -> list of start/restart timestamps, oldest first
    childs_uptime = {}

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    def _launch_entry(entry_name):
        # Start the entry process with stdin, stdout and stderr all piped.
        cmd = "%s %s %s %s %s %s %s" % (sys.executable,
                                        os.path.join(STARTUP_DIR, "glideFactoryEntry.py"),
                                        os.getpid(), sleep_time, advertize_rate,
                                        startup_dir, entry_name)
        return popen2.Popen3(cmd, True)

    def _prepare_pipes(child):
        # Nothing is ever sent to the child, and since we will run for a long
        # time we do not want reads from it to block.
        child.tochild.close()
        for fd in (child.fromchild.fileno(), child.childerr.fileno()):
            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

    logSupport.log.info("Starting entries %s" % entries)
    try:
        for entry_name in entries:
            childs[entry_name] = _launch_entry(entry_name)
            # Remember the startup time. Used to check if the entry is
            # crashing periodically and needs to be restarted.
            childs_uptime[entry_name] = [time.time()]
        logSupport.log.info("Entry startup times: %s" % childs_uptime)

        for entry_name in list(childs.keys()):
            _prepare_pipes(childs[entry_name])

        # Check if freq is greater than zero.  If negative, do not do credential cleanup.
        if int(glideinDescript.data['RemoveOldCredFreq']) > 0:
            # Convert credential removal frequency from hours to seconds.
            # NOTE(review): this used to multiply by 60 only, which contradicted
            # the documented unit and looked like leftover test scaling.
            remove_old_cred_freq = int(glideinDescript.data['RemoveOldCredFreq']) * 60 * 60
            curr_time = time.time()
            update_time = curr_time + remove_old_cred_freq

            # Convert credential removal age from days to seconds
            # (same note as above: previously scaled by 60 only).
            remove_old_cred_age = int(glideinDescript.data['RemoveOldCredAge']) * 24 * 60 * 60

            # Create cleaners for old credential files
            logSupport.log.info("Adding cleaners for old credentials")
            cred_base_dir = glideinDescript.data['ClientProxiesBaseDir']
            for username in frontendDescript.get_all_usernames():
                cred_base_user = os.path.join(cred_base_dir, "user_%s" % username)
                cred_user_instance_dirname = os.path.join(cred_base_user, "glidein_%s" % glideinDescript.data['GlideinName'])
                cred_cleaner = cleanupSupport.PrivsepDirCleanupCredentials(username, cred_user_instance_dirname,
                                                                           "(credential_*)",
                                                                           remove_old_cred_age)
                cleanupSupport.cred_cleaners.add_cleaner(cred_cleaner)

        while 1:
            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace is up.
            # If a compromised key is left around and if attacker can somehow
            # trigger FactoryEntry process crash, we do not want the entry to pick up
            # the old key again when factory auto restarts it.
            if ((time.time() > oldkey_eoltime) and
                    (glideinDescript.data['OldPubKeyObj'] is not None)):
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after its grace time of %s seconds" % oldkey_gracetime)
                except Exception:
                    # Do not crash if delete fails. Just log it.
                    logSupport.log.warning("Failed to remove the old public key after its grace time")

            # Only removing credentials in the v3+ protocol
            # This is because it mainly matters for Corral Frontends, which only support the v3+ protocol.
            # IF freq < zero, do not do cleanup.
            # (curr_time/update_time only exist when the frequency is positive;
            # the short-circuit keeps them from being read otherwise.)
            if int(glideinDescript.data['RemoveOldCredFreq']) > 0 and curr_time >= update_time:
                logSupport.log.info("Checking credentials for cleanup")

                # Query queue for glideins.  We don't want to remove proxies that are currently in use.
                try:
                    in_use_creds = glideFactoryLib.getCondorQCredentialList()
                    cleanupSupport.cred_cleaners.cleanup(in_use_creds)
                except Exception:
                    logSupport.log.exception("Unable to cleanup old credentials")

                update_time = curr_time + remove_old_cred_freq

            curr_time = time.time()

            logSupport.log.info("Checking for credentials %s" % entries)

            # read in the frontend globals classad
            # Do this first so that the credentials are immediately available when the Entries startup
            try:
                classads = glideFactoryCredentials.get_globals_classads()
            except Exception:
                logSupport.log.exception("Error occurred processing globals classads: ")
                # Nothing was fetched: process nothing this round instead of
                # hitting an unbound name or re-processing stale classads.
                classads = {}

            for classad_key in classads.keys():
                classad = classads[classad_key]
                try:
                    glideFactoryCredentials.process_global(classad, glideinDescript, frontendDescript)
                except Exception:
                    logSupport.log.exception("Error occurred processing the globals classads: ")

            logSupport.log.info("Checking entries %s" % entries)
            # Iterate over a snapshot: exited children are removed/replaced below.
            for entry_name in list(childs.keys()):
                child = childs[entry_name]

                # empty stdout and stderr
                try:
                    tempOut = child.fromchild.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("Child %s STDOUT: %s" % (entry_name, tempOut))
                except IOError:
                    pass  # non-blocking pipe with nothing to read; ignore
                try:
                    tempErr = child.childerr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("Child %s STDERR: %s" % (entry_name, tempErr))
                except IOError:
                    pass  # non-blocking pipe with nothing to read; ignore

                # look for exited child (poll() yields -1 while still running)
                if child.poll() != -1:
                    # the child exited
                    logSupport.log.warning("Child %s exited. Checking if it should be restarted." % (entry_name))
                    tempOut = child.fromchild.readlines()
                    tempErr = child.childerr.readlines()

                    if is_crashing_often(childs_uptime[entry_name], restart_interval, restart_attempts):
                        del childs[entry_name]
                        raise RuntimeError("Entry '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (entry_name, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting child %s." % (entry_name))
                        # Drop the dead child first so the exit cleanup never
                        # sees it if the relaunch itself fails.
                        del childs[entry_name]
                        childs[entry_name] = _launch_entry(entry_name)
                        # Keep at most restart_attempts timestamps per entry.
                        if len(childs_uptime[entry_name]) == restart_attempts:
                            childs_uptime[entry_name].pop(0)
                        childs_uptime[entry_name].append(time.time())
                        _prepare_pipes(childs[entry_name])
                        logSupport.log.warning("Entry startup/restart times: %s" % childs_uptime)

            logSupport.log.info("Aggregate monitoring data")
            aggregate_stats(factory_downtimes.checkDowntime())

            # Advertise the global classad with the factory keys
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(glideinDescript.data['FactoryName'],
                                                       glideinDescript.data['GlideinName'],
                                                       glideFactoryLib.factoryConfig.supported_signtypes,
                                                       glideinDescript.data['PubKeyObj'])

            except Exception:
                # The exception is fetched via sys.exc_info() so the handler
                # needs no version-specific "except ... as/," binding syntax.
                logSupport.log.warning("Error occurred while trying to advertize global.\nError is: %s" % str(sys.exc_info()[1]))

            # Run the registered cleaners just before the sleep
            cleanupSupport.cleaners.cleanup()

            logSupport.log.info("Sleep %s secs" % sleep_time)
            time.sleep(sleep_time)
    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # Deliberately catches everything:
                # if anything goes wrong, hardkill the rest
                for entry_name in childs.keys():
                    logSupport.log.info("Hard killing entry %s" % entry_name)
                    try:
                        os.kill(childs[entry_name].pid, signal.SIGKILL)
                    except OSError:
                        pass  # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(glideinDescript.data['FactoryName'], glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.exception("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(glideinDescript.data['FactoryName'], glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.exception("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All entries should be terminated")
Beispiel #2
0
def spawn(sleep_time, advertize_rate, startup_dir,
          glideinDescript, frontendDescript, entries, restart_attempts, restart_interval):
    """Start one glideFactoryEntry.py child per entry and supervise them forever.

    Each loop iteration: drops the old public key once its grace time is over,
    processes the frontend globals classads, drains and logs the children's
    stdout/stderr, restarts children that exited, aggregates monitoring data,
    advertises the factory global classad, runs the registered cleaners and
    finally sleeps.

    The loop only ends through an exception. On the way out the children are
    terminated (hard-killed if the clean exit fails) and the factory is
    deadvertised.

    Args:
        sleep_time: seconds to sleep between iterations; also passed to the
            entry processes on their command line.
        advertize_rate: passed through to the entry processes.
        startup_dir: passed through to the entry processes.
        glideinDescript: object exposing a ``data`` mapping (keys read here:
            OldPubKeyGraceTime, DowntimesFile, GlideinName, FactoryName,
            OldPubKeyObj, OldPubKeyType, PubKeyObj) and ``remove_old_key()``.
        frontendDescript: handed to ``glideFactoryCredentials.process_globals``.
        entries: iterable of entry names, one child process is started for each.
        restart_attempts: passed to ``is_crashing_often``; also the maximum
            number of start times remembered per entry.
        restart_interval: passed to ``is_crashing_often``.

    Raises:
        RuntimeError: when ``is_crashing_often`` reports that an exited entry
            has been crashing too often.
    """

    global STARTUP_DIR
    childs = {}

    starttime = time.time()
    oldkey_gracetime = int(glideinDescript.data['OldPubKeyGraceTime'])
    oldkey_eoltime = starttime + oldkey_gracetime

    # entry name -> list of start/restart timestamps, oldest first
    childs_uptime = {}

    factory_downtimes = glideFactoryDowntimeLib.DowntimeFile(glideinDescript.data['DowntimesFile'])

    def _launch_entry(entry_name):
        # Start the entry process with stdin, stdout and stderr all piped.
        cmd = "%s %s %s %s %s %s %s" % (sys.executable,
                                        os.path.join(STARTUP_DIR, "glideFactoryEntry.py"),
                                        os.getpid(), sleep_time, advertize_rate,
                                        startup_dir, entry_name)
        return popen2.Popen3(cmd, True)

    def _prepare_pipes(child):
        # Nothing is ever sent to the child, and since we will run for a long
        # time we do not want reads from it to block.
        child.tochild.close()
        for fd in (child.fromchild.fileno(), child.childerr.fileno()):
            fl = fcntl.fcntl(fd, fcntl.F_GETFL)
            fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)

    logSupport.log.info("Starting entries %s" % entries)
    try:
        for entry_name in entries:
            childs[entry_name] = _launch_entry(entry_name)
            # Remember the startup time. Used to check if the entry is
            # crashing periodically and needs to be restarted.
            childs_uptime[entry_name] = [time.time()]
        logSupport.log.info("Entry startup times: %s" % childs_uptime)

        for entry_name in list(childs.keys()):
            _prepare_pipes(childs[entry_name])

        while 1:
            # THIS IS FOR SECURITY
            # Make sure you delete the old key when its grace is up.
            # If a compromised key is left around and if attacker can somehow
            # trigger FactoryEntry process crash, we do not want the entry to pick up
            # the old key again when factory auto restarts it.
            if ((time.time() > oldkey_eoltime) and
                    (glideinDescript.data['OldPubKeyObj'] is not None)):
                glideinDescript.data['OldPubKeyObj'] = None
                glideinDescript.data['OldPubKeyType'] = None
                try:
                    glideinDescript.remove_old_key()
                    logSupport.log.info("Removed the old public key after it's grace time of %s seconds" % oldkey_gracetime)
                except Exception:
                    # Do not crash if delete fails. Just log it (once).
                    logSupport.log.warning("Failed to remove the old public key after it's grace time")
            logSupport.log.info("Checking for credentials %s" % entries)
            try:
                # read in the frontend globals classad
                # Do this first so that the credentials are immediately available when the Entries startup
                glideFactoryCredentials.process_globals(glideinDescript, frontendDescript)
            except Exception:
                # format_exception returns a list of lines; join them so the
                # log shows a readable traceback instead of a list repr.
                tb = "".join(traceback.format_exception(*sys.exc_info()))
                error_str = "Error occurred processing the globals classads. \nTraceback: \n%s" % tb
                logSupport.log.warning(error_str)

            logSupport.log.info("Checking entries %s" % entries)
            # Iterate over a snapshot: exited children are removed/replaced below.
            for entry_name in list(childs.keys()):
                child = childs[entry_name]

                # empty stdout and stderr
                try:
                    tempOut = child.fromchild.read()
                    if len(tempOut) != 0:
                        logSupport.log.warning("Child %s STDOUT: %s" % (entry_name, tempOut))
                except IOError:
                    pass  # non-blocking pipe with nothing to read; ignore
                try:
                    tempErr = child.childerr.read()
                    if len(tempErr) != 0:
                        logSupport.log.warning("Child %s STDERR: %s" % (entry_name, tempErr))
                except IOError:
                    pass  # non-blocking pipe with nothing to read; ignore

                # look for exited child (poll() yields -1 while still running)
                if child.poll() != -1:
                    # the child exited
                    logSupport.log.warning("Child %s exited. Checking if it should be restarted." % (entry_name))
                    tempOut = child.fromchild.readlines()
                    tempErr = child.childerr.readlines()

                    if is_crashing_often(childs_uptime[entry_name], restart_interval, restart_attempts):
                        del childs[entry_name]
                        raise RuntimeError("Entry '%s' has been crashing too often, quit the whole factory:\n%s\n%s" % (entry_name, tempOut, tempErr))
                    else:
                        # Restart the entry setting its restart time
                        logSupport.log.warning("Restarting child %s." % (entry_name))
                        # Drop the dead child first so the exit cleanup never
                        # sees it if the relaunch itself fails.
                        del childs[entry_name]
                        childs[entry_name] = _launch_entry(entry_name)
                        # Keep at most restart_attempts timestamps per entry.
                        if len(childs_uptime[entry_name]) == restart_attempts:
                            childs_uptime[entry_name].pop(0)
                        childs_uptime[entry_name].append(time.time())
                        _prepare_pipes(childs[entry_name])
                        logSupport.log.warning("Entry startup/restart times: %s" % childs_uptime)

            logSupport.log.info("Aggregate monitoring data")
            aggregate_stats(factory_downtimes.checkDowntime())

            # Advertise the global classad with the factory keys
            try:
                # KEL TODO need to add factory downtime?
                glideFactoryInterface.advertizeGlobal(glideinDescript.data['FactoryName'],
                                                       glideinDescript.data['GlideinName'],
                                                       glideFactoryLib.factoryConfig.supported_signtypes,
                                                       glideinDescript.data['PubKeyObj'])

            except Exception:
                # The exception is fetched via sys.exc_info() so the handler
                # needs no version-specific "except ... as/," binding syntax.
                logSupport.log.warning("Error occurred while trying to advertize global.\nError is: %s" % str(sys.exc_info()[1]))

            # Run the registered cleaners just before the sleep
            cleanupSupport.cleaners.cleanup()

            logSupport.log.info("Sleep %s secs" % sleep_time)
            time.sleep(sleep_time)
    finally:
        # cleanup at exit
        logSupport.log.info("Received signal...exit")
        try:
            try:
                clean_exit(childs)
            except:
                # Deliberately catches everything:
                # if anything goes wrong, hardkill the rest
                for entry_name in childs.keys():
                    logSupport.log.info("Hard killing entry %s" % entry_name)
                    try:
                        os.kill(childs[entry_name].pid, signal.SIGKILL)
                    except OSError:
                        pass  # ignore dead clients
        finally:
            logSupport.log.info("Deadvertize myself")
            try:
                glideFactoryInterface.deadvertizeFactory(glideinDescript.data['FactoryName'], glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.warning("Factory deadvertize failed!")
            try:
                glideFactoryInterface.deadvertizeFactoryClientMonitoring(glideinDescript.data['FactoryName'], glideinDescript.data['GlideinName'])
            except:
                # just warn
                logSupport.log.warning("Factory Monitoring deadvertize failed!")
        logSupport.log.info("All entries should be terminated")