def start_accepter():
  unique_id = rsa_publickey_to_string(configuration['publickey'])
  unique_id = sha_hexhash(unique_id) + str(configuration['service_vessel'])
  unique_id += "." + NAME_SERVER

  # do this until we get the accepter started...
  while True:
    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname

    else:
      for possibleport in configuration['ports']:
        try:
          servicelogger.log("[INFO]: Trying to wait")

          # We advertise the unique_id first so that we can perform waitforconn
          # on it later. It's tempting to do a waitforconn directly on the
          # current IP, but IPs are not unique. If we are behind a NAT, our IP
          # can be some private address which may have duplicates registered in
          # the NAT forwarder. As a result, a client may not be able to locate
          # us within the NAT forwarder. Hence, waitforconn must occur on a
          # unique resolvable name.
          advertise_to_DNS(unique_id)

          timeout_waitforconn(unique_id, possibleport,
              nmconnectionmanager.connection_handler, timeout=10,
              use_shim=True, shim_string=default_shim)

        except Exception, e:
          servicelogger.log("[ERROR]: when calling waitforconn for the connection_handler: " + str(e))
          servicelogger.log_last_exception()

        else:
          # the waitforconn was completed so the accepter is started
          accepter_state['lock'].acquire()
          accepter_state['started'] = True
          accepter_state['lock'].release()

          # assign the nodemanager name
          myname = unique_id + ":" + str(possibleport)

          servicelogger.log("[INFO]: Now listening as " + myname)
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for waitforconn.")

def run(self):
  try:
    while True:
      # if there are any requests, add them to the dict.
      add_requests()

      if len(connection_dict) > 0:
        # get the "first" request
        conn = pop_request()
        nmrequesthandler.handle_request(conn)

      else:
        # check at most twice a second (if nothing is new)
        time.sleep(self.sleeptime)

  except:
    servicelogger.log_last_exception()
    raise

def run(self):
  try:
    while True:
      if len(connection_dict) > 0:
        # get the "first" request
        conn = pop_request()
        # Removing this logging which seems excessive...
        # servicelogger.log('start handle_request:'+str(id(conn)))
        nmrequesthandler.handle_request(conn)
        # servicelogger.log('finish handle_request:'+str(id(conn)))

      else:
        # check at most twice a second (if nothing is new)
        time.sleep(self.sleeptime)

  except:
    servicelogger.log_last_exception()
    raise

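# The AccepterThread.run() loops above assume a shared connection_dict and the
# add_requests()/pop_request() helpers. The following is a minimal, hypothetical
# sketch of those helpers (the queue layout, locking scheme, and the
# pending_sockets name are assumptions, not the actual nmconnectionmanager
# implementation): connections are queued in arrival order and handed out
# oldest-first.
import threading

connection_dict = {}       # pending sockets, keyed by an arbitrary id
connection_order = []      # insertion order of the keys in connection_dict
pending_sockets = []       # sockets accepted but not yet queued
connection_lock = threading.Lock()


def add_requests():
  # move any newly accepted sockets into the request dictionary
  connection_lock.acquire()
  try:
    while pending_sockets:
      socketobj = pending_sockets.pop(0)
      connection_dict[id(socketobj)] = socketobj
      connection_order.append(id(socketobj))
  finally:
    connection_lock.release()


def pop_request():
  # return the oldest queued connection (the "first" request)
  connection_lock.acquire()
  try:
    key = connection_order.pop(0)
    return connection_dict.pop(key)
  finally:
    connection_lock.release()
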
def handle_request(socketobj):
  try:
    # let's get the request...
    # BUG: Should prevent endless data / slow retrieval attacks
    fullrequest = session_recvmessage(socketobj)

  # Armon: Catch a vanilla exception because repy emulated_sockets
  # will raise Exception when the socket has been closed.
  # This is changed from just passing through socket.error,
  # which we were catching previously.
  except Exception, e:
    # close if possible
    safe_close(socketobj)

    # I can't handle this, let's exit
    # BUG: REMOVE LOGGING IN PRODUCTION VERSION (?)
    servicelogger.log_last_exception()
    return

def safe_log_last_exception():
  """
  Log the last exception in a way that cannot throw an exception. First try to
  log using the servicelogger, then just try to print the message.
  """
  try:
    # Get the last exception in case the servicelogger fails.
    exceptionstr = traceback.format_exc()
  except:
    pass

  try:
    servicelogger.log_last_exception()
  except:
    try:
      print exceptionstr
    except:
      # As the standard output streams aren't closed, it would seem that this
      # should never happen. If it does, though, what can we do to log the
      # message, other than directly write to a file?
      pass

def handle_request(socketobj):
  # always close the socketobj
  try:
    try:
      # let's get the request...
      # BUG: Should prevent endless data / slow retrieval attacks
      fullrequest = session.session_recvmessage(socketobj)

    # Armon: Catch a vanilla exception because repy emulated_sockets
    # will raise Exception when the socket has been closed.
    # This is changed from just passing through socket.error,
    # which we were catching previously.
    except Exception, e:
      # JAC: Fix for the exception logging observed in #992
      if 'Socket closed' in str(e) or 'timed out!' in str(e):
        servicelogger.log('Connection abruptly closed during recv')
        return
      elif 'Bad message size' in str(e):
        servicelogger.log('Received bad message size')
        return
      else:
        # I can't handle this, let's exit
        # BUG: REMOVE LOGGING IN PRODUCTION VERSION (?)
        servicelogger.log_last_exception()
        return

    # handle the request as appropriate
    try:
      retstring = process_API_call(fullrequest)

    # Bad parameters, signatures, etc.
    except nmAPI.BadRequest, e:
      session.session_sendmessage(socketobj, str(e) + "\nError")
      return

def start_accepter():
  global accepter_thread

  # do this until we get the accepter started...
  while True:
    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname_port

    else:
      # If we came here because a reset was initiated, kill the old
      # accepter thread server socket before starting a new one.
      try:
        accepter_thread.close_serversocket()
        servicelogger.log("Closed previous accepter thread server socket.")
      except:
        # There was no accepter_thread, or it couldn't .close_serversocket().
        # No problem -- this means nothing will be in the way of the new
        # serversocket.
        pass

      # Use getmyip() to find the IP address the nodemanager should
      # listen on for incoming connections. This will work correctly
      # if IP/interface preferences have been set.
      # We only want to call getmyip() once rather than in the loop
      # since this potentially avoids rebuilding the allowed IP
      # cache for each possible port
      bind_ip = getmyip()

      # Attempt to have the nodemanager listen on an available port.
      # Once it is able to listen, create a new thread and pass it the socket.
      # That new thread will be responsible for handling all of the incoming connections.
      for possibleport in configuration['ports']:
        try:
          # Use a Repy socket for listening. This lets us override
          # the listenforconnection function with a version using an
          # Affix stack easily; furthermore, we can transparently use
          # the Repy sockettimeout library to protect against malicious
          # clients that feed us endless data (or no data) to tie up
          # the connection.
          try:
            serversocket = timeout_listenforconnection(bind_ip, possibleport, 10)
          except (AlreadyListeningError, DuplicateTupleError), e:
            # These are rather dull errors that will result in us
            # trying a different port. Don't print a stack trace.
            servicelogger.log("[ERROR]: listenforconnection for address " +
                bind_ip + ":" + str(possibleport) + " failed with error '" +
                repr(e) + "'. Retrying.")
            continue

          # Assign the nodemanager name.
          # We re-retrieve our address using getmyip as we may now be using
          # a zenodotus name instead.
          myname_port = str(getmyip()) + ":" + str(possibleport)

          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()

          # Now that we created an accepter, let's use it!
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False

        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR] setting up nodemanager serversocket " +
              "on address " + bind_ip + ":" + str(possibleport) + ": " + repr(e))
          servicelogger.log_last_exception()

        else:
          break

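# A minimal, hypothetical sketch of the accepter bookkeeping helpers the code
# above relies on (set_accepter, is_accepter_started). The lock-based layout
# below is an assumption made to address the "is this thread safe!?" question,
# not necessarily the actual nmmain.py implementation.
import threading

accepter_thread = None
accepter_state_lock = threading.Lock()


def set_accepter(accepter):
  global accepter_thread
  # take the lock so readers never observe a half-updated reference
  accepter_state_lock.acquire()
  try:
    accepter_thread = accepter
  finally:
    accepter_state_lock.release()


def is_accepter_started():
  accepter_state_lock.acquire()
  try:
    # the accepter counts as started once its thread exists and is alive
    return accepter_thread is not None and accepter_thread.isAlive()
  finally:
    accepter_state_lock.release()
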
def run(self):
  # Put everything in a try except block so that if badness happens, we can
  # log it before dying.
  try:
    while True:
      # remove stale items from the advertise dict. This is important because
      # we're using membership in the dict to indicate a need to advertise
      clean_advertise_dict()

      # this list contains the keys we will advertise
      advertisekeylist = []

      # JAC: advertise under the node's key
      if rsa_publickey_to_string(self.nodekey) not in lastadvertisedict and self.nodekey not in advertisekeylist:
        advertisekeylist.append(self.nodekey)

      # make a copy so there isn't an issue with a race
      for vesselname in self.addict.keys()[:]:
        try:
          thisentry = self.addict[vesselname].copy()
        except KeyError:
          # the entry must have been removed in the meantime. Skip it!
          continue

        # if I advertise the vessel...
        if thisentry['advertise']:
          # add the owner key if not there already...
          if rsa_publickey_to_string(thisentry['ownerkey']) not in lastadvertisedict and thisentry['ownerkey'] not in advertisekeylist:
            advertisekeylist.append(thisentry['ownerkey'])

          # and all user keys if not there already
          for userkey in thisentry['userkeys']:
            if rsa_publickey_to_string(userkey) not in lastadvertisedict and userkey not in advertisekeylist:
              advertisekeylist.append(userkey)

      # there should be no dups.
      assert advertisekeylist == listops_uniq(advertisekeylist)

      # now that I know who to announce to, send messages to announce my IP and
      # port to all keys I support
      for advertisekey in advertisekeylist:
        try:
          advertise_announce(advertisekey, str(myname), adTTL)
          # mark when we advertise
          lastadvertisedict[rsa_publickey_to_string(advertisekey)] = getruntime()

          # If the announce succeeded, and node was offline, log info message
          # and switch it back to online mode.
          if self.is_offline:
            info_msg = 'Node is back online.'
            if self.error_count:
              info_msg += ' (Encountered ' + str(self.error_count) + ' advertise errors)'
            servicelogger.log('[INFO]: ' + info_msg)
            self.error_count = 0
            self.is_offline = False

        except AdvertiseError, e:
          # If all announce requests failed, assume the node has gone offline.
          if str(e) == "None of the advertise services could be contacted":
            self.is_offline = True
            # Log an error message after every 'N' failures
            if (self.error_count % error_skip_count == 0):
              servicelogger.log('AdvertiseError occurred, continuing: ' + str(e))
            self.error_count += 1
          # Log all other types of errors
          else:
            servicelogger.log('AdvertiseError occurred, continuing: ' + str(e))

  except Exception, e:
    servicelogger.log_last_exception()
    # an unexpected exception occurred, exit and restart
    return

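# Hypothetical sketch of the clean_advertise_dict() helper called at the top of
# the loop above: entries are dropped from lastadvertisedict once they are older
# than the advertise TTL, so the corresponding keys get re-announced on the next
# pass. The exact expiry policy is an assumption, not the actual nmadvertise code.
def clean_advertise_dict():
  now = getruntime()
  for advertisekeystring in lastadvertisedict.keys()[:]:
    if now - lastadvertisedict[advertisekeystring] > adTTL:
      del lastadvertisedict[advertisekeystring]
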
def start_accepter():
  global accepter_thread

  # do this until we get the accepter started...
  while True:
    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname_port

    else:
      # If we came here because a reset was initiated, kill the old
      # accepter thread server socket before starting a new one.
      try:
        accepter_thread.close_serversocket()
        servicelogger.log("Closed previous accepter thread server socket.")
      except:
        # There was no accepter_thread, or it couldn't .close_serversocket().
        # No problem -- this means nothing will be in the way of the new
        # serversocket.
        pass

      # Just use getmyip(), this is the default behavior and will work if we have preferences set
      # We only want to call getmyip() once, rather than in the loop since this potentially avoids
      # rebuilding the allowed IP cache for each possible port
      bind_ip = getmyip()

      # Attempt to have the nodemanager listen on an available port.
      # Once it is able to listen, create a new thread and pass it the socket.
      # That new thread will be responsible for handling all of the incoming connections.
      for portindex in range(len(configuration['ports'])):
        possibleport = configuration['ports'][portindex]
        try:
          # There are two possible implementations available here:
          # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881
          # 2) Use a repy socket, but then possibly leak many connections.

          # For now, we'll use the second method and use the sockettimeout
          # library so we can still use a timeout to ensure we don't have
          # any malicious clients that feed us endless data (or no data)
          # to tie up the connection. Note that if we are using Affix,
          # we will be using a TimeoutAffix to achieve the equivalent
          # outcome.
          serversocket = timeout_listenforconnection(bind_ip, possibleport, 10)

          # assign the nodemanager name.
          # We re-retrieve our address using getmyip as we may now be using
          # a zenodotus name instead.
          myname_port = str(getmyip()) + ":" + str(possibleport)

          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()

          # Now that we created an accepter, let's use it!
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False

        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e))
          servicelogger.log_last_exception()

        else:
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for recvmess")

def handle_threading_error():
  """
  <Purpose>
    Handles a repy node failing with ThreadErr. If repy is allowed to use
    more than 10% of the current threads, reduce the global thread count by
    50% and stop all existing vessels.

  <Arguments>
    None

  <Exceptions>
    None

  <Side Effects>
    May re-write all resource files and stop all vessels

  <Returns>
    None
  """
  # Make a log of this
  servicelogger.log("[ERROR]:A Repy vessel has exited with ThreadErr status. Checking to determine next step")

  # Get all the names of the vessels
  vesselnamelist = nmAPI.vesseldict.keys()

  # read in all of the resource files so that we can look at and possibly
  # manipulate them.
  resourcedicts = {}
  for vesselname in vesselnamelist:
    resourcedicts[vesselname] = resourcemanipulation.read_resourcedict_from_file('resource.' + vesselname)

  # Get the number of threads Repy has allocated
  allowedthreadcount = 0
  for vesselname in vesselnamelist:
    allowedthreadcount = allowedthreadcount + resourcedicts[vesselname]['events']

  # Get the total number of system threads currently used
  totalusedthreads = nonportable.os_api.get_system_thread_count()

  # Log this information
  servicelogger.log("[WARNING]:System Threads: " + str(totalusedthreads) + " Repy Allocated Threads: " + str(allowedthreadcount))

  # Get the NM configuration
  configuration = persist.restore_object("nodeman.cfg")

  # Check if there is a threshold configuration,
  # otherwise add the default configuration
  if NOOP_CONFIG_KEY in configuration:
    threshold = configuration[NOOP_CONFIG_KEY]
  else:
    threshold = DEFAULT_NOOP_THRESHOLD
    configuration[NOOP_CONFIG_KEY] = threshold
    persist.commit_object(configuration, "nodeman.cfg")

  # Check if we are below the threshold; if so,
  # just return, this is a noop
  if allowedthreadcount < totalusedthreads * threshold:
    return

  servicelogger.log("[ERROR]:Reducing number of system threads!")

  #### We are above the threshold! Let's cut everything by 1/2

  # First, update the resource files
  for vesselname in vesselnamelist:
    # cut the events by 1/2
    resourcedicts[vesselname]['events'] = resourcedicts[vesselname]['events'] / 2

    # write out the new resource files...
    resourcemanipulation.write_resourcedict_to_file(resourcedicts[vesselname], 'resource.' + vesselname)

  # Create the stop tuple, exit code 57 with an error message
  stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.")

  # Stop each vessel
  for vesselname in vesselnamelist:
    try:
      # Stop each vessel, using our stoptuple
      nmAPI.stopvessel(vesselname, stoptuple)
    except Exception, exp:
      # Forge on, regardless of errors
      servicelogger.log("[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: " + str(exp))
      servicelogger.log_last_exception()

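# A small worked example of the no-op threshold check above. The docstring says
# the handler only acts when Repy is allowed more than 10% of the current
# threads, so DEFAULT_NOOP_THRESHOLD is taken to be 0.10 here (an assumption
# about the constant's value); with 200 system threads in use, an allocation
# below 20 events is left alone.
DEFAULT_NOOP_THRESHOLD = 0.10


def is_above_noop_threshold(allowedthreadcount, totalusedthreads,
    threshold=DEFAULT_NOOP_THRESHOLD):
  # mirrors the "allowedthreadcount < totalusedthreads * threshold" no-op test
  return allowedthreadcount >= totalusedthreads * threshold


assert not is_above_noop_threshold(15, 200)   # 15 < 20, so this is a no-op
assert is_above_noop_threshold(30, 200)       # 30 >= 20, so threads get reduced
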
def start_accepter():
  global accepter_thread
  global affix_enabled
  global affix_stack_string

  # do this until we get the accepter started...
  while True:
    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname

    else:
      # Just use getmyip(), this is the default behavior and will work if we have preferences set
      # We only want to call getmyip() once, rather than in the loop since this potentially avoids
      # rebuilding the allowed IP cache for each possible port
      bind_ip = emulcomm.getmyip()

      # Attempt to have the nodemanager listen on an available port.
      # Once it is able to listen, create a new thread and pass it the socket.
      # That new thread will be responsible for handling all of the incoming connections.
      for portindex in range(len(configuration['ports'])):
        possibleport = configuration['ports'][portindex]
        try:
          # There are two possible implementations available here:
          # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881
          # 2) Use a repy socket, but then possibly leak many connections.

          # Check to see if AFFIX is enabled.
          try:
            affix_enabled_lookup = advertise_lookup(enable_affix_key)[-1]
            # Now we check if the last entry is True or False.
            if affix_enabled_lookup == 'True':
              affix_stack_string = advertise_lookup(affix_service_key)[-1]
              affix_enabled = True
            else:
              affix_enabled = False
          except AdvertiseError:
            affix_enabled = False
          except ValueError:
            affix_enabled = False
          except IndexError:
            # This will occur if the advertise server returns an empty list.
            affix_enabled = False

          # If AFFIX is enabled, then we use AFFIX to open up a tcpserversocket.
          if affix_enabled:
            # Here we are going to use a for loop to find a second available port
            # for us to use for the LegacyShim. Since the LegacyShim opens up two
            # tcpserversockets, it needs two available ports: the first for a
            # normal repy listenforconnection call, the second for a shim-enabled
            # listenforconnection call.
            for shimportindex in range(portindex + 1, len(configuration['ports'])):
              shimport = configuration['ports'][shimportindex]
              affix_legacy_string = "(LegacyShim," + str(shimport) + ",0)" + affix_stack_string
              affix_object = ShimStackInterface(affix_legacy_string)
              serversocket = affix_object.listenforconnection(bind_ip, possibleport)
              servicelogger.log("[INFO]Started accepter thread with Affix string: " + affix_legacy_string)
              break
            else:
              # This is the case if we weren't able to find any port to listen on
              # with the legacy shim.
              raise ShimError("Unable to create tcpserversocket with shims using port:" + str(possibleport))

          else:
            # If AFFIX is not enabled, then we open up a normal tcpserversocket.
            # For now, we'll use the second method.
            serversocket = listenforconnection(bind_ip, possibleport)

          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()

          # Now that we created an accepter, let's use it!
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False

        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e))
          servicelogger.log_last_exception()

        else:
          # assign the nodemanager name
          myname = str(bind_ip) + ":" + str(possibleport)
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for recvmess")

def handle_threading_error(nmAPI):
  """
  <Purpose>
    Handles a repy node failing with ThreadErr. Reduces global thread count
    by 50%. Restarts all existing vessels.

  <Arguments>
    nmAPI: the nmAPI module -- passed to the function to avoid import loops;
           see ticket #590 for more information about this.
  """
  # Make a log of this
  servicelogger.log("[ERROR]:A Repy vessel has exited with ThreadErr status. Patching restrictions and resetting all vessels.")

  # Get the number of threads Repy has allocated
  allocatedThreads = get_allocated_threads()

  # Get the number of system threads currently in use
  systemThreads = nonportable.os_api.get_system_thread_count()

  # Log this information
  servicelogger.log("[ERROR]:System Threads: " + str(systemThreads) + " Repy Allocated Threads: " + str(allocatedThreads))

  # Get the NM configuration
  configuration = persist.restore_object("nodeman.cfg")

  # Check if there is a threshold configuration,
  # otherwise add the default configuration
  if NOOP_CONFIG_KEY in configuration:
    threshold = configuration[NOOP_CONFIG_KEY]
  else:
    threshold = DEFAULT_NOOP_THRESHOLD
    configuration[NOOP_CONFIG_KEY] = threshold
    persist.commit_object(configuration, "nodeman.cfg")

  # Check if we are below the threshold; if so,
  # just return, this is a noop
  if allocatedThreads < systemThreads * threshold:
    return

  # We are continuing, so we are above the threshold!
  # First, update the restrictions
  update_restrictions()

  # Then, stop the vessels

  # Get all the vessels
  vessels = nmAPI.vesseldict.keys()

  # Create the stop tuple, exit code 57 with an error message
  stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.")

  # Stop each vessel
  for vessel in vessels:
    try:
      # Stop each vessel, using our stoptuple
      nmAPI.stopvessel(vessel, stoptuple)
    except Exception, exp:
      # Forge on, regardless of errors
      servicelogger.log("[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: " + str(exp))
      servicelogger.log_last_exception()

    # handle the request as appropriate
    try:
      retstring = process_API_call(fullrequest)

    # Bad parameters, signatures, etc.
    except nmAPI.BadRequest, e:
      session.session_sendmessage(socketobj, str(e) + "\nError")
      return

    # Other exceptions only should happen on an internal error and should be
    # captured by servicelogger.log
    except Exception, e:
      servicelogger.log_last_exception()
      session.session_sendmessage(socketobj, "Internal Error\nError")
      return

    # send the output of the command...
    session.session_sendmessage(socketobj, retstring)

  except Exception, e:
    # JAC: Fix for the exception logging observed in #992
    if 'Socket closed' in str(e) or 'timed out!' in str(e):
      servicelogger.log('Connection abruptly closed in send')
      return
    else:
      raise

  finally:

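# Hypothetical client-side sketch of the exchange that handle_request() above
# services: one session-framed request out, one session-framed reply back. The
# socketobj is assumed to be an already-connected Repy-style socket, and
# call_nodemanager is an illustrative name, not part of the nodemanager API.
import session


def call_nodemanager(socketobj, requeststring):
  session.session_sendmessage(socketobj, requeststring)
  response = session.session_recvmessage(socketobj)
  # per the handler above, failures come back with a trailing "\nError"
  if response.endswith("\nError"):
    raise Exception("nodemanager returned an error: " + response)
  return response
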
def run(self):
  try:
    while True:
      # the race condition here is that they might delete something and I will
      # check it. This is okay. I'll end up getting a KeyError when trying
      # to update the dictionary (checked below) or look at the old entry.
      for vesselname in self.statusdict.keys()[:]:
        try:
          statusfilename = self.statusdict[vesselname]['statusfilename']
          oldstatus = self.statusdict[vesselname]['status']
        except KeyError:
          # race condition, this was removed in the meantime.
          continue

        # there should be a status file (assuming we've inited)
        try:
          status, timestamp = statusstorage.read_status(statusfilename)
        except IOError, e:
          # if the file exists, raise the exception since we don't know what
          # it is about.
          if e[0] != 2:
            raise

          # file not found. This means it is fresh...
          status = 'Fresh'
          timestamp = time.time()

        # Armon: Check if status is ThreadErr, this is a critical error condition
        # that requires lowering the global thread count, and resetting all vessels
        if status == "ThreadErr":
          # Check if this is the first time for this timestamp
          # Since the status file is not removed, this is necessary so that we do not
          # continuously trigger the error handling code
          if timestamp not in self.threadErrSet:
            # Add the timestamp
            self.threadErrSet.add(timestamp)

            # Call the error handling module
            nmthreadingerror.handle_threading_error(self.nmAPI)

        # The status has a timestamp in case the process is killed harshly and
        # needs to be restarted. This allows ordering of status reports
        staleness = time.time() - timestamp

        if staleness < 0:
          # time is running backwards, likely an NTP update (allow it)...
          # print "Time is running backwards by increment '"+str(staleness)+"', allowing this"
          newstatus = status

        elif staleness > updatebound:
          # stale?
          newstatus = oldstatus
          if oldstatus == 'Started':
            # BUG: What happens if we're wrong and it's alive? What do we do?
            # How do we detect and fix this safely?
            newstatus = 'Stale'
            # We set the timestamp so that our update happens in the table...
            timestamp = time.time() - updatebound

        else:
          # it seems to be okay. Use the given status
          newstatus = status

        update_status(self.statusdict, vesselname, newstatus, timestamp)

      time.sleep(self.sleeptime)

  except Exception, e:
    servicelogger.log_last_exception()
    raise e

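# A small, hypothetical worked example of the staleness classification above
# (updatebound is assumed to be 120 seconds here; the real value comes from the
# status monitor's configuration).
import time

updatebound = 120


def classify_status(status, oldstatus, timestamp, now=None):
  now = time.time() if now is None else now
  staleness = now - timestamp
  if staleness < 0:
    return status      # clock ran backwards (e.g. NTP update): trust the report
  elif staleness > updatebound:
    # stale report: keep the old status, except that 'Started' becomes 'Stale'
    return 'Stale' if oldstatus == 'Started' else oldstatus
  else:
    return status      # fresh enough: use the reported status


# a report ten minutes old from a vessel we thought was 'Started' is marked Stale
assert classify_status('Started', 'Started', time.time() - 600) == 'Stale'
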
          myname_port = str(bind_ip) + ":" + str(possibleport)

          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()

          # Now that we created an accepter, let's use it!
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False

        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e))
          servicelogger.log_last_exception()

        else:
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for recvmess")

    # check infrequently
    time.sleep(configuration['pollfrequency'])
