def process_API_call(fullrequest):
    """
    <Purpose>
      Dispatch one raw request to the API function registered for it in
      API_dict, checking the request signature first when the call is not
      public.

    <Arguments>
      fullrequest:
        The request string. Fields are separated by '|' and the first field
        is the call name. For non-public calls the signature is split off
        with fastsigneddata.signeddata_split_signature and the first
        argument must be the vessel name.

    <Exceptions>
      nmAPI.BadRequest if the call is unknown, if the request has too few
      fields, or if the vessel is unknown.
      Anything raised by ensure_is_correctly_signed or by the API function
      itself propagates to the caller.

    <Side Effects>
      For signed calls, the request signature is stored as the vessel's
      'oldmetadata' entry in nmAPI.vesseldict.

    <Returns>
      Whatever the API function returns.
    """
    callname = fullrequest.split('|')[0]

    if DEBUG_MODE:
        servicelogger.log("Now handling call: " + callname)

    if callname not in API_dict:
        raise nmAPI.BadRequest("Unknown Call")

    # find the entry that describes this call...
    numberofargs, permissiontype, APIfunction = API_dict[callname]

    # we'll do the signature checks first... (the signature needs to be
    # stripped off to get the args anyways)...
    if permissiontype == 'Public':
        # There should be no signature, so this is the raw request...
        # It must hold the call name plus numberofargs arguments; anything
        # shorter would only fail later as a TypeError inside the call.
        if len(fullrequest.split('|')) < numberofargs + 1:
            raise nmAPI.BadRequest("Not Enough Arguments")

        # If there are 3 args, we want to split at most 3 times (the first
        # item is the callname)
        callargs = fullrequest.split('|', numberofargs)
        # return any output for the user...
        return APIfunction(*callargs[1:])

    # strip off the signature and get the requestdata
    requestdata, requestsignature = \
        fastsigneddata.signeddata_split_signature(fullrequest)

    # NOTE: the first argument *must* be the vessel name!
    requestfields = requestdata.split('|', 2)
    if len(requestfields) < 2:
        # No vessel field at all: report it as a bad request instead of
        # letting an IndexError escape.
        raise nmAPI.BadRequest("Not Enough Arguments")
    vesselname = requestfields[1]

    if vesselname not in nmAPI.vesseldict:
        raise nmAPI.BadRequest('Unknown Vessel')

    # I must have something to check...
    if permissiontype == 'Owner':
        # only the owner is allowed, so the list of keys is merely that key
        allowedkeys = [nmAPI.vesseldict[vesselname]['ownerkey']]
    else:
        # the user keys are also allowed
        allowedkeys = [nmAPI.vesseldict[vesselname]['ownerkey']] + \
            nmAPI.vesseldict[vesselname]['userkeys']

    # I need to pass the fullrequest in here...
    ensure_is_correctly_signed(fullrequest, allowedkeys,
                               nmAPI.vesseldict[vesselname]['oldmetadata'])

    # If there are 3 args, we want to split at most 3 times (the first item
    # is the callname)
    callargs = requestdata.split('|', numberofargs)

    # store the request signature as old metadata
    nmAPI.vesseldict[vesselname]['oldmetadata'] = requestsignature

    # return any output for the user...
    return APIfunction(*callargs[1:])
def advertise_to_DNS(unique_id): """ Advertise unique_id to the zenodotus DNS server. We strip away whatever that follows the NAME_SERVER part of the unique_id. For instance, if our unique_id is abc.NAME_SERVER:1234@xyz, then we only advertise abc.NAME_SERVER. """ # IP that maps to the unique_id myip = emulcomm.getmyip() # Extract the part of unique_id up to the name server, # i.e. xyz.zenodotus.washington.edu, and discard whatever that follows name_server_pos = unique_id.find(NAME_SERVER) if name_server_pos > -1: unique_id = unique_id[0 : name_server_pos + len(NAME_SERVER)] else: raise Exception("Invalid unique_id format: '" + str(unique_id) + "'") advertise_success = False # We keep trying until successful advertisement (Fix for Ticket #956) while not advertise_success: try: advertise_announce(unique_id, myip, DNS_CACHE_TTL) servicelogger.log("[INFO]: Advertised " + str(unique_id) + " which maps to " + myip) advertise_success = True except Exception, error: if 'announce error' in str(error): # We can confidently drop the exception here. The advertisement service # can sometimes be flaky, yet it can guarantee advertisement of our # key-value pair on at least one of the three components. Thus, we are # printing the error message as a warning here. advertise_success = True else: advertise_success = False
def update_restrictions():
    """
    <Purpose>
      Ask nmrestrictionsprocessor to rewrite the "events" resource line of
      all resource files, scaling the configured value by EVENT_SCALAR and
      never going below HARD_MIN.

    <Arguments>
      None.

    <Side Effects>
      Every error entry returned by process_all_files is printed and written
      to the service log.

    <Returns>
      None.
    """

    def _scaled_event_limit(fields):
        # fields[2] holds the current limit; any failure to read or convert
        # it falls back to the hard minimum.
        try:
            scaled = int(float(fields[2]) * EVENT_SCALAR)
            return max(scaled, HARD_MIN)
        except:
            return HARD_MIN

    # One task: patch the "events" resource with the helper above.
    # NOTE(review): the meaning of the trailing True is defined by
    # nmrestrictionsprocessor -- confirm there.
    failures = nmrestrictionsprocessor.process_all_files(
        [("resource", "events", _scaled_event_limit, True)])

    # Each entry is indexed as (file name, exception). An empty result simply
    # skips the loop.
    for failure in failures:
        print(failure)
        servicelogger.log(
            "[ERROR]:Unable to patch events limit in resource file " +
            failure[0] + ", exception " + str(failure[1]))
def update_restrictions():
    """
    <Purpose>
      Patch the "events" limit in all resource files through
      nmrestrictionsprocessor. The new limit is the old one multiplied by
      EVENT_SCALAR, truncated to an int, and clamped to at least HARD_MIN.

    <Arguments>
      None.

    <Side Effects>
      Prints and logs every error entry reported by process_all_files.

    <Returns>
      None.
    """

    def _new_thread_count(line_fields):
        # Takes the fields of a resource line and returns the new number of
        # threads. Anything that goes wrong yields the hard minimum.
        try:
            candidate = int(float(line_fields[2]) * EVENT_SCALAR)
        except:
            return HARD_MIN
        return max(candidate, HARD_MIN)

    patch_events = ("resource", "events", _new_thread_count, True)

    # Process all the resource files and report whatever went wrong.
    problems = nmrestrictionsprocessor.process_all_files([patch_events])
    for problem in problems:
        print(problem)
        servicelogger.log(
            "[ERROR]:Unable to patch events limit in resource file " +
            problem[0] + ", exception " + str(problem[1]))
def uninstall_nokia(): """ <Purpose> Remove the startup script and symlink to it in the /etc/init.d and /etc/rc2.d directories, and kill all seattle processes by using stop_all_seattle_processes. This requires the user to be currently on root access. <Arguments> None. <Exceptions> None. <Side Effects> Removes the startup script and the symlink to it, and stops seattle from running. <Returns> True if succeeded in uninstalling, False otherwise. """ # Note to developers: If you need to change the path of the startup script or # the path of the symlink, make sure you keep it consistent with those in # seattleinstaller.py. startup_script_name = "nokia_seattle_startup.sh" # The directory where the startup script resides. startup_script_dir = "/etc/init.d/" # The full path to the startup script. startup_script_path = startup_script_dir + startup_script_name # The name of the symlink that links to the startup script. symlink_name = "S99startseattle" # The directory where the symlink to the startup script resides. symlink_dir = "/etc/rc2.d/" # The full path to the symlink. symlink_path = symlink_dir + symlink_name # Check if the startup script and the symlink exists. if not os.path.exists(startup_script_path) and \ not os.path.lexists(symlink_path): _output("Neither the startup script nor the symlink exists.") return True # Remove the startup script. try: os.remove(startup_script_path) # Cannot remove the startup script due to some reason. except OSError, e: # The startup script does not exist - that is fine, we will continue # and try to remove the symlink. if e.errno == errno.ENOENT: pass else: # The startup script cannot be removed. _output("The startup script cannot be removed. Make sure you have the " \ + "permission to do so.") servicelogger.log("Seattle cannot be uninstalled because " \ + startup_script_path + " cannot be removed.") return False
def set_accepter(accepter):
    """
    <Purpose>
      Record accepter as the module-wide accepter thread.

    <Arguments>
      accepter:
        The object to store in the global accepter_thread.

    <Side Effects>
      Rebinds the global accepter_thread while holding
      accepter_state['lock']. Writes a debug line to the service log when
      DEBUG_MODE is set.

    <Returns>
      None.
    """
    global accepter_thread

    accepter_state['lock'].acquire(True)
    try:
        accepter_thread = accepter

        if DEBUG_MODE:
            servicelogger.log("[DEBUG] Accepter Thread has been set...")
    finally:
        # Release even if logging fails, otherwise every later caller would
        # block forever on the lock.
        accepter_state['lock'].release()
def process_API_call(fullrequest):
    """
    <Purpose>
      Look up the call named at the start of fullrequest in API_dict, run
      the signature check its permission type requires, and invoke the
      registered API function with the request's arguments.

    <Arguments>
      fullrequest:
        The request string: '|' separated fields, call name first. Calls
        that are not 'Public' carry a signature (split off with
        fastsigneddata.signeddata_split_signature) and must name the vessel
        as their first argument.

    <Exceptions>
      nmAPI.BadRequest for an unknown call, too few fields, or an unknown
      vessel.
      Exceptions from ensure_is_correctly_signed and from the API function
      are not caught here.

    <Side Effects>
      Signed calls overwrite nmAPI.vesseldict[vesselname]['oldmetadata']
      with the request signature.

    <Returns>
      The API function's return value.
    """
    callname = fullrequest.split('|')[0]

    if DEBUG_MODE:
        servicelogger.log("Now handling call: " + callname)

    if callname not in API_dict:
        raise nmAPI.BadRequest("Unknown Call")

    # find the entry that describes this call...
    numberofargs, permissiontype, APIfunction = API_dict[callname]

    # we'll do the signature checks first... (the signature needs to be
    # stripped off to get the args anyways)...
    if permissiontype == 'Public':
        # There should be no signature, so this is the raw request...
        # Call name + numberofargs arguments are required. The previous
        # bound (numberofargs - 1) let short requests through to the API
        # function, where they surfaced as a TypeError.
        if len(fullrequest.split('|')) < numberofargs + 1:
            raise nmAPI.BadRequest("Not Enough Arguments")

        # If there are 3 args, we want to split at most 3 times (the first
        # item is the callname)
        callargs = fullrequest.split('|', numberofargs)
        # return any output for the user...
        return APIfunction(*callargs[1:])

    # strip off the signature and get the requestdata
    requestdata, requestsignature = \
        fastsigneddata.signeddata_split_signature(fullrequest)

    # NOTE: the first argument *must* be the vessel name!
    leadingfields = requestdata.split('|', 2)
    if len(leadingfields) < 2:
        # A signed request without any argument has no vessel name.
        raise nmAPI.BadRequest("Not Enough Arguments")
    vesselname = leadingfields[1]

    if vesselname not in nmAPI.vesseldict:
        raise nmAPI.BadRequest('Unknown Vessel')

    # I must have something to check...
    if permissiontype == 'Owner':
        # only the owner is allowed, so the list of keys is merely that key
        allowedkeys = [nmAPI.vesseldict[vesselname]['ownerkey']]
    else:
        # the user keys are also allowed
        allowedkeys = [nmAPI.vesseldict[vesselname]['ownerkey']] + \
            nmAPI.vesseldict[vesselname]['userkeys']

    # I need to pass the fullrequest in here...
    ensure_is_correctly_signed(fullrequest, allowedkeys,
                               nmAPI.vesseldict[vesselname]['oldmetadata'])

    # If there are 3 args, we want to split at most 3 times (the first item
    # is the callname)
    callargs = requestdata.split('|', numberofargs)

    # store the request signature as old metadata
    nmAPI.vesseldict[vesselname]['oldmetadata'] = requestsignature

    # return any output for the user...
    return APIfunction(*callargs[1:])
def check_and_create_affix_object(virtual_host_name): """ <Purpose> The purpose of this function is to check if Affix has been enabled, If it is enabled, we create an Affix object with the advertised Affix string and return the Affix object as well as whether Affix is enabled. <Arguments> virtual_host_name - the zenodotus name we want to set for this node. <Exceptions> None <Return> Returns a Tuple in the form: (Boolean, AffixStackInterface, String) The first item in the tuple is whether Affix has been enabled. The second item is an AffixStackInterface object if Affix has been enabled. Otherwise the second item is None. The third item is the Affix string that is being used for the Affix object. """ global affix_stack_string global affix_enabled # Check to see if AFFIX is enabled. try: affix_enabled_lookup = advertise_lookup(enable_affix_key)[-1] # Now we check if the last entry is True or False. if affix_enabled_lookup == 'True': affix_stack_string = advertise_lookup(affix_service_key)[-1] affix_enabled = True servicelogger.log("[INFO]: Current advertised Affix string: " + str(affix_stack_string)) # If Affix is enabled, we can go ahead and create the Affix object # right away so we don't have to repeatedly create it in the # loop below. affix_legacy_string = "(CoordinationAffix)" + affix_stack_string affix_object = AffixStackInterface(affix_legacy_string, virtual_host_name) # Return the results. return (affix_enabled, affix_object, affix_legacy_string) else: affix_enabled = False # Affix is not enabled, so we return (False, None) return (affix_enabled, None, None) except (AdvertiseError, TimeoutError, ValueError, IndexError), e: servicelogger.log("Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) affix_enabled = False # Raise error on debug mode. if DEBUG_MODE: raise # Affix is not enabled, so we return (False, None) return (affix_enabled, None, None)
def uninstall_Windows():
    """
    <Purpose>
      Removes seattle from the Windows registry startup key and/or the
      startup folder should either exist, then stops all seattle processes
      using stop_all_seattle_process.py
    <Arguments>
      None.
    <Exceptions>
      Possible IOError could be caused by filepath manipulation from a
        sub-function.
      SeattleNotInstalledError if seattle was not installed prior to
        uninstall.
    <Side Effects>
      Removes seattle from the Windows registry key and/or the Windows
      startup folder if it exists in either place.
      Stops seattle from running.
    <Returns>
      True if the uninstall succeeded.  Currently, if uninstall fails, it
      must be because seattle was not installed prior to uninstall.  We must
      return a boolean value for the parent function.
    """
    # Both flags are used after the blocks that may set them, so both need a
    # value up front. Without the second default, a missing startup folder
    # path (WIN_STARTUP_SCRIPT_PATH is None) led to an UnboundLocalError
    # instead of SeattleNotInstalledError.
    removed_from_registry = False
    removed_from_startup_folder = False

    # First see if seattle appears as a value in the Windows startup registry
    # key, and remove it if it exists.
    try:
        removed_from_registry = remove_seattle_from_win_startup_registry()
    except WindowsError:
        # Single-argument print(...) behaves the same as the print statement.
        print("The uninstaller does not have access to the Windows registry " \
              + "startup keys. This means that seattle is likely not installed in " \
              + "your Windows registry startup key, though you may want to " \
              + "manually check the following registry keys and remove seattle " \
              + "from those keys should it exist there: ")
        print("HKEY_LOCAL_MACHINE\Software\Microsoft\Windows\CurrentVersion\Run")
        print("HKEY_CURRENT_USER\Software\Microsoft\Windows\CurrentVersion\Run")
        # Distinguish the above-printed text from what will be printed later
        # by printing a blank line.
        print("")
        servicelogger.log(" uninstaller could not access the Windows registry " \
                          + "during this attempted uninstall.")

    # Next, see if there is a link to the seattle starter script in the
    # startup folder and remove it if it is there.
    if WIN_STARTUP_SCRIPT_PATH is not None:
        removed_from_startup_folder = \
            remove_seattle_from_win_startup_folder()

    # Check to see if uninstall actually removed seattle from the computer.
    if not removed_from_registry and not removed_from_startup_folder:
        raise SeattleNotInstalledError("Seattle could not be detected as " \
                                       + "having been installed prior to " \
                                       + "uninstall.")

    # At least one of the two removals happened.
    # Stop all instances of seattle from running before returning.
    stop_all_seattle_processes.main()
    return True
def start_accepter():
    # Visible part: when AUTO_USE_NAT is False, probe whether a
    # bi-directional connection is possible and store the outcome in use_nat.
    # NOTE(review): use_nat is not assigned on the path where AUTO_USE_NAT is
    # not False, and nothing in view consumes it -- the rest of this function
    # appears to be missing from this copy; confirm.
    if AUTO_USE_NAT == False:
        # check to see if we should use the nat layer
        try:
            # see if we can currently have a bi-directional connection
            use_nat = nat_check_bi_directional(getmyip(), configuration['ports'][0])
        except Exception,e:
            # Any failure of the probe is logged and treated as "no NAT".
            servicelogger.log("Exception occurred trying to contact forwarder to detect nat "+str(e))
            use_nat = False
def parse_arguments():
    """
    Parse all the arguments passed in through the command line for the
    nodemanager and copy the recognised options into module globals:

      --foreground      sets FOREGROUND to True
      --test-mode       sets TEST_NM to True
      --shims <name>    logs the choice and stores it in default_shim

    Keeping the option table in one place makes it easy to add and remove
    options from the nodemanager later on.
    """
    global FOREGROUND
    global TEST_NM
    global default_shim

    cli = optparse.OptionParser(version="Seattle " + version)

    # The boolean switches differ only in flag, destination and help text.
    switches = (
        ('--foreground', 'foreground',
         "Run the nodemanager in foreground " + "instead of daemonizing it."),
        ('--test-mode', 'test_mode',
         "Run the nodemanager in test mode."),
    )
    for flag, destination, description in switches:
        cli.add_option(flag, dest=destination, action='store_true',
                       default=False, help=description)

    # --shims [shim name]: Forces use of the specified shims. The shim name
    # must conform to the format as specified in:
    # https://seattle.cs.washington.edu/wiki/UsingShims.
    cli.add_option('--shims', type="string", dest="shim_name",
                   help="Use a user specified shim instead of the" +
                        " default (NatDeciderShim)")

    # Positional arguments are not used.
    parsed = cli.parse_args()[0]

    # Only ever switch the globals on; absent options leave them untouched.
    if parsed.foreground:
        FOREGROUND = True

    if parsed.test_mode:
        TEST_NM = True

    if parsed.shim_name:
        servicelogger.log("[INFO]: Using user-specified shims " +
                          parsed.shim_name)
        default_shim = parsed.shim_name
def safe_log(message):
    """
    Log a message through the servicelogger on a best-effort basis.

    Any error raised while logging is deliberately discarded so that a
    logging problem can never break the caller. There is no fallback
    output: if the servicelogger fails, the message is lost.
    """
    try:
        servicelogger.log(message)
    except Exception:
        # Best effort only. KeyboardInterrupt and SystemExit are not
        # swallowed, so a shutdown request still gets through.
        pass
def log(*args):
    """
    Write all arguments, converted with str() and joined by single spaces,
    to the servicelogger as one entry.
    """
    logstring = " ".join(map(str, args))

    # servicelogger.log will append a trailing newline to the string, so
    # drop one existing trailing newline (if any) to avoid doubling it.
    if logstring.endswith("\n"):
        logstring = logstring[:-1]

    servicelogger.log(logstring)
def new_affix_listenforconnection(localip, localport, timeout=10):
    # NOTE(review): only the clean-up of the old advertise handle is visible
    # here. localip, localport and timeout are not used in the visible part
    # and nothing is returned -- the rest of the function appears to be
    # missing from this copy; confirm.
    global affix_enabled
    global affix_stack_string
    global zenodotus_advertise_handle

    # Similarly, stop advertising my old Zenodotus name (if any),
    # ignoring potential errors. If any error is raised, it will
    # be recorded.
    try:
        advertisepipe.remove_from_pipe(zenodotus_advertise_handle)
    except Exception, err:
        # Logged and otherwise ignored so that clean-up problems do not stop
        # the caller.
        servicelogger.log("Unexpected error when attempting to " +
                          "remove old zenodotus_advertise_handle. " + str(type(err)) +
                          ". " + str(err))
def run(self):
    """
    Accept connections from self.serversocket forever and pass each one to
    connection_handler.

    SocketWouldBlockError and SocketTimeoutError just mean that nothing
    arrived: wait half a second and poll again. Any other exception is
    logged with its traceback as fatal and ends the loop.
    """
    # Run indefinitely. The short sleep on the "nothing to accept" path keeps
    # the loop from consuming an inordinate amount of resources.
    while True:
        try:
            ip, port, client_socket = self.serversocket.getconnection()
            connection_handler(ip, port, client_socket)
        except (SocketWouldBlockError, SocketTimeoutError):
            sleep(0.5)
        except Exception:
            servicelogger.log("FATAL error in AccepterThread: " +
                              traceback.format_exc())
            return
def safe_log(message):
    """
    Log a message on a best-effort basis, without letting a logging problem
    break the caller.

    First try to log using the servicelogger. If that fails, fall back to
    printing the message. If printing fails as well, the message is
    dropped.
    """
    try:
        servicelogger.log(message)
    except Exception:
        # Only ordinary errors are swallowed. KeyboardInterrupt and
        # SystemExit still propagate so a shutdown request is honoured.
        try:
            print(message)
        except Exception:
            # As the standard output streams aren't closed, it would seem
            # that this should never happen. If it does, though, what can we
            # do to log the message, other than directly write to a file?
            pass
def main():
    # Node manager start-up, as far as visible here: daemonize unless
    # FOREGROUND is set, take the single-instance process lock, load
    # "nodeman.cfg" into the global configuration, initialise the network
    # restrictions and, on Linux/Darwin, update the seattle crontab entry
    # once.
    # NOTE(review): the body appears to continue beyond what is visible in
    # this copy; confirm before relying on this summary.
    global configuration

    if not FOREGROUND:
        # Background ourselves.
        daemon.daemonize()

    # ensure that only one instance is running at a time...
    gotlock = runonce.getprocesslock("seattlenodemanager")
    if gotlock == True:
        # I got the lock.   All is well...
        pass
    else:
        # Any other truthy value is reported as the pid of the process that
        # holds the lock; a falsy value means the holder is unknown.
        if gotlock:
            servicelogger.log("[ERROR]:Another node manager process (pid: " + str(gotlock) +
                              ") is running")
        else:
            servicelogger.log("[ERROR]:Another node manager process is running")
        return

    # I'll grab the necessary information first...
    servicelogger.log("[INFO]:Loading config")
    # BUG: Do this better?   Is this the right way to engineer this?
    configuration = persist.restore_object("nodeman.cfg")

    # Armon: initialize the network restrictions
    initialize_ip_interface_restrictions(configuration)

    # ZACK BOKA: For Linux and Darwin systems, check to make sure that the new
    #            seattle crontab entry has been installed in the crontab.
    #            Do this here because the "nodeman.cfg" needs to have been read
    #            into configuration via the persist module.
    if nonportable.ostype == 'Linux' or nonportable.ostype == 'Darwin':
        if 'crontab_updated_for_2009_installer' not in configuration or \
                configuration['crontab_updated_for_2009_installer'] == False:
            try:
                import update_crontab_entry
                modified_crontab_entry = \
                    update_crontab_entry.modify_seattle_crontab_entry()
                # If updating the seattle crontab entry succeeded, then update the
                # 'crontab_updated_for_2009_installer' so the nodemanager no longer
                # tries to update the crontab entry when it starts up.
                if modified_crontab_entry:
                    configuration['crontab_updated_for_2009_installer'] = True
                    persist.commit_object(configuration,"nodeman.cfg")

            except Exception,e:
                # A failed crontab update is logged with its traceback and
                # does not stop start-up.
                exception_traceback_string = traceback.format_exc()
                servicelogger.log("[ERROR]: The following error occured when " \
                                  + "modifying the crontab for the new 2009 " \
                                  + "seattle crontab entry: " \
                                  + exception_traceback_string)
def handle_request(socketobj): # always close the socketobj try: try: # let's get the request... # BUG: Should prevent endless data / slow retrival attacks fullrequest = session.session_recvmessage(socketobj) # Armon: Catch a vanilla exception because repy emulated_sockets # will raise Exception when the socket has been closed. # This is changed from just passing through socket.error, # which we were catching previously. except Exception, e: #JAC: Fix for the exception logging observed in #992 if 'Socket closed' in str(e) or 'timed out!' in str(e): servicelogger.log('Connection abruptly closed during recv') return elif 'Bad message size' in str(e): servicelogger.log('Received bad message size') return else: # I can't handle this, let's exit # BUG: REMOVE LOGGING IN PRODUCTION VERSION (?) servicelogger.log_last_exception() return # handle the request as appropriate try: retstring = process_API_call(fullrequest) # Bad parameters, signatures, etc. except nmAPI.BadRequest,e: session.session_sendmessage(socketobj, str(e)+"\nError") return
def enable_affix(affix_string):
    """
    <Purpose>
      Overload the listenforconnection() and getmyip() API call if Affix is
      enabled.

    <Arguments>
      affix_string:
        The Affix string used to construct the AffixStack.

    <SideEffects>
      When affix_enabled is set, the globals timeout_listenforconnection and
      getmyip are replaced by Affix-backed versions and the Affix string is
      logged. Otherwise nothing happens.

    <Exceptions>
      None
    """
    global timeout_listenforconnection
    global getmyip

    # If Affix is not enabled, we just return.
    if not affix_enabled:
        return

    # One Affix stack backs both replacement calls.
    affix_backend = affix_stack.AffixStack(affix_string)

    def new_timeout_listenforconnection(localip, localport, timeout):
        # Open the listening socket through Affix and wrap it with
        # timeout_server_socket, like the call being replaced.
        return timeout_server_socket(
            affix_backend.listenforconnection(localip, localport), timeout)

    # Overload the two functionalities with Affix functionalities
    # that will be used later on.
    timeout_listenforconnection = new_timeout_listenforconnection
    getmyip = affix_backend.getmyip

    servicelogger.log('[INFO] Nodemanager now using Affix string: ' +
                      affix_string)
def start_accepter():
    # Build this node's unique name (hash of the public key + service vessel
    # + "." + NAME_SERVER), advertise it, and call timeout_waitforconn on it
    # for each configured port until one succeeds.
    # NOTE(review): the body appears to continue beyond what is visible in
    # this copy (nothing in view pauses between passes of the outer loop or
    # clears node_reset_config['reset_accepter']); confirm.

    unique_id = rsa_publickey_to_string(configuration['publickey'])
    unique_id = sha_hexhash(unique_id) + str(configuration['service_vessel'])
    unique_id += "." + NAME_SERVER

    # do this until we get the accepter started...
    while True:

        if not node_reset_config['reset_accepter'] and is_accepter_started():
            # we're done, return the name!
            # NOTE(review): myname is only assigned in the try/else branch
            # below, so reaching this line on the very first pass would hit
            # an unbound local -- confirm this cannot happen.
            return myname

        else:
            for possibleport in configuration['ports']:
                try:
                    servicelogger.log("[INFO]: Trying to wait")

                    # We advertise the unique_id first so that we can perform waitforconn
                    # on it later. It's tempting to do a waitforconn directly on the
                    # current IP, but IPs are not unique. If we are behind a NAT, our IP
                    # can be some private address which may have duplicates registered in
                    # the NAT forwarder. As a result, a client may not be able to locate
                    # us within the NAT forwarder. Hence, waitforconn must occur on a unique
                    # resolvable name.
                    advertise_to_DNS(unique_id)

                    timeout_waitforconn(unique_id, possibleport,
                                        nmconnectionmanager.connection_handler,
                                        timeout=10, use_shim=True, shim_string=default_shim)

                except Exception, e:
                    # Log the failure and move on to the next port.
                    servicelogger.log("[ERROR]: when calling waitforconn for the connection_handler: " + str(e))
                    servicelogger.log_last_exception()
                else:
                    # the waitforconn was completed so the accepter is started
                    accepter_state['lock'].acquire()
                    accepter_state['started']= True
                    accepter_state['lock'].release()

                    # assign the nodemanager name
                    myname = unique_id + ":" + str(possibleport)
                    servicelogger.log("[INFO]: Now listening as " + myname)

                    break

            else:
                # for/else: reached only when no port succeeded.
                servicelogger.log("[ERROR]: cannot find a port for waitforconn.")
node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log( "[ERROR] setting up nodemanager serversocket " + "on address " + bind_ip + ":" + str(possibleport) + ": " + repr(e)) servicelogger.log_last_exception() else: break else: # We exhausted the list of possibleport's to no avail. # Pause to avoid busy-waiting for the problem to go away. servicelogger.log( "[ERROR]: Could not create serversocket. Sleeping for 30 seconds." ) time.sleep(30) # check infrequently time.sleep(configuration['pollfrequency']) # has the thread started? def is_worker_thread_started(): for thread in threading.enumerate(): if 'WorkerThread' in str(thread): return True else: return False
def start_accepter(): global accepter_thread global affix_enabled global affix_stack_string global zenodotus_advertise_handle # do this until we get the accepter started... while True: if not node_reset_config['reset_accepter'] and is_accepter_started(): # we're done, return the name! return myname_port else: # If we came here because a reset was initiated, kill the old # accepter thread server socket before starting a new one. try: accepter_thread.close_serversocket() servicelogger.log( "Closed previous accepter thread server socket.") except: # There was no accepter_thread, or it couldn't .close_serversocket(). # No problem -- this means nothing will be in the way of the new # serversocket. pass # Similarly, stop advertising my old Zenodotus name (if any), # ignoring potential errors. try: advertisepipe.remove_from_pipe(zenodotus_advertise_handle) except: pass # Just use getmyip(), this is the default behavior and will work if we have preferences set # We only want to call getmyip() once, rather than in the loop since this potentially avoids # rebuilding the allowed IP cache for each possible port bind_ip = emulcomm.getmyip() # Attempt to have the nodemanager listen on an available port. # Once it is able to listen, create a new thread and pass it the socket. # That new thread will be responsible for handling all of the incoming connections. for portindex in range(len(configuration['ports'])): possibleport = configuration['ports'][portindex] try: # There are two possible implementations available here: # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881 # 2) Use a repy socket, but then possibly leak many connections. # Check to see if AFFIX is enabled. try: affix_enabled_lookup = advertise_lookup( enable_affix_key)[-1] servicelogger.log("affix_enabled_lookup is " + str(affix_enabled_lookup)) # Now we check if the last entry is True or False. 
if affix_enabled_lookup == 'True': affix_stack_string = advertise_lookup( affix_service_key)[-1] affix_enabled = True servicelogger.log( "[INFO]: Current advertised Affix string: " + str(affix_stack_string)) else: affix_enabled = False except (AdvertiseError, TimeoutError), e: servicelogger.log( "Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) affix_enabled = False # Raise error on debug mode. if DEBUG_MODE: raise except ValueError: servicelogger.log( "Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) affix_enabled = False # Raise error on debug mode. if DEBUG_MODE: raise except IndexError: servicelogger.log( "Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) # This will occur if the advertise server returns an empty list. affix_enabled = False # Raise error on debug mode. if DEBUG_MODE: raise # If AFFIX is enabled, then we use AFFIX to open up a tcpserversocket. if affix_enabled: # Here we are going to use a for loop to find a second available port # for us to use for the LegacyAffix. Since the LegacyAffix opens up two # tcpserversocket, it needs two available ports. The first for a normal # repy listenforconnection call, the second for affix enabled # listenforconnection call. # We keep track of how many times we failed to listen with the Affix # framework. If we exceed 3, we default to Repy V2 API. Note that we # will try three times with each port, if we are unable to connect # with legacy Repy V2 API as well. fail_affix_count = 0 error_list = [] for affixportindex in range( portindex + 1, len(configuration['ports'])): affixport = configuration['ports'][affixportindex] # Assign the nodemanager name to be the nodekey. We replace any whitespace in the # name and append zenodotus tag at the end. 
mypubkey = rsa_publickey_to_string( configuration['publickey']).replace(" ", "") myname = sha_hexhash( mypubkey) + '.zenodotus.poly.edu' myname_port = myname + ":" + str(possibleport) # Announce my (new) Zenodotus name zenodotus_advertise_handle = advertisepipe.add_to_pipe( myname, getmyip()) affix_legacy_string = "(CoordinationAffix)(LegacyAffix," + myname + "," + str( affixport) + ",0," affix_legacy_string += "(CoordinationAffix)" + affix_stack_string + ")" affix_object = AffixStackInterface( affix_legacy_string) # Now that we have found the Affix string and have created the AffixStackInterface # object, we will try to open up a listening tcp socket. If we fail to do so # 3 times, we will default to legacy Repy V2 socket. try: serversocket = affix_object.listenforconnection( myname, possibleport) servicelogger.log( "[INFO]Started accepter thread with Affix string: " + affix_legacy_string) break except (AddressBindingError, AlreadyListeningError, DuplicateTupleError), e: servicelogger.log( "Failed to open listening socket with Affix on port: " + str(affixport) + ". Found error: " + str(e)) fail_affix_count += 1 error_list.append((type(e), str(e))) # If we fail more than 2 times, we will stop attempting to try listening # on a socket with the Affix framework. if fail_affix_count > 2: servicelogger.log( "Failed to open socket using Affix after three attemps." + "Now resuming with legacy Repy socket. Errors were: " + str(error_list)) serversocket = timeout_listenforconnection( bind_ip, possibleport, 10) # assign the nodemanager name myname_port = str(bind_ip) + ":" + str( possibleport) break except Exception, e: servicelogger.log( "[ERROR] Found Listenforconnection had exception: " + str(e)) raise else: # If AFFIX is not enabled, then we open up a normal tcpserversocket. # For now, we'll use the second method. 
serversocket = timeout_listenforconnection( bind_ip, possibleport, 10) # assign the nodemanager name myname_port = str(bind_ip) + ":" + str(possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False
def run(self): # Put everything in a try except block so that if badness happens, we can # log it before dying. try: while True: # remove stale items from the advertise dict. This is important because # we're using membership in the dict to indicate a need to advertise clean_advertise_dict() # this list contains the keys we will advertise advertisekeylist = [] # JAC: advertise under the node's key if rsa_publickey_to_string(self.nodekey) not in lastadvertisedict and self.nodekey not in advertisekeylist: advertisekeylist.append(self.nodekey) # make a copy so there isn't an issue with a race for vesselname in self.addict.keys()[:]: try: thisentry = self.addict[vesselname].copy() except KeyError: # the entry must have been removed in the meantime. Skip it! continue # if I advertise the vessel... if thisentry['advertise']: # add the owner key if not there already... if rsa_publickey_to_string(thisentry['ownerkey']) not in lastadvertisedict and thisentry['ownerkey'] not in advertisekeylist: advertisekeylist.append(thisentry['ownerkey']) # and all user keys if not there already for userkey in thisentry['userkeys']: if rsa_publickey_to_string(userkey) not in lastadvertisedict and userkey not in advertisekeylist: advertisekeylist.append(userkey) # there should be no dups. assert(advertisekeylist == listops_uniq(advertisekeylist)) # now that I know who to announce to, send messages to annouce my IP and # port to all keys I support for advertisekey in advertisekeylist: try: advertise_announce(advertisekey, str(myname), adTTL) # mark when we advertise lastadvertisedict[rsa_publickey_to_string(advertisekey)] = getruntime() # If the announce succeeded, and node was offline, log info message # and switch it back to online mode. if self.is_offline: info_msg = 'Node is back online.' 
if self.error_count: info_msg += ' (Encountered ' + str(self.error_count) + \ ' advertise errors)' servicelogger.log('[INFO]: ' + info_msg) self.error_count = 0 self.is_offline = False except AdvertiseError, e: # If all announce requests failed, assume node has # gone offline, if str(e) == "None of the advertise services could be contacted": self.is_offline = True # Log an error message after every 'N' failures if (self.error_count % error_skip_count == 0): servicelogger.log('AdvertiseError occured, continuing: '+str(e)) self.error_count += 1 # Log all other types of errors else: servicelogger.log('AdvertiseError occured, continuing: '+str(e)) except Exception, e: servicelogger.log_last_exception() # an unexpected exception occured, exit and restart return
# We only want to call getmyip() once, rather than in the loop since this potentially avoids # rebuilding the allowed IP cache for each possible port bind_ip = emulcomm.getmyip() for possibleport in configuration['ports']: try: if use_nat: # use the sha hash of the nodes public key with the vessel # number as an id for this node unique_id = rsa_publickey_to_string( configuration['publickey']) hashedunique_id = sha.new(unique_id).hexdigest() advertiseid = hashedunique_id + str( configuration['service_vessel']) servicelogger.log("[INFO]: Trying NAT wait") nat_waitforconn(advertiseid, possibleport, nmconnectionmanager.connection_handler) # do a local waitforconn (not using a forwarder) # this makes the node manager easily accessible locally #JAC: I do a timeout waitforconn in an attempt to address #881 # 10 seconds should be adequate for a client to respond / communicate timeout_waitforconn(bind_ip, possibleport, nmconnectionmanager.connection_handler, timeout=10) # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e:
def main():
    """
    <Purpose>
      Node manager start-up: optionally daemonize, make sure only one
      instance runs (or write a pid file in test mode), log release and
      platform information, load nodeman.cfg into the global
      `configuration`, initialize the IP/interface restrictions and, on
      Linux/Darwin, update the seattle crontab entry once.

    <Arguments>
      None.

    <Exceptions>
      Errors raised while updating the crontab entry are caught and logged.
      Everything else propagates.

    <Side Effects>
      Rebinds the module-level `configuration`. May write 'nodemanager.pid'
      into the current directory and may commit nodeman.cfg.

    <Returns>
      None. Returns early when another node manager holds the process lock.
    """
    global configuration

    if not FOREGROUND:
        # Background ourselves.
        daemon.daemonize()

    if TEST_NM:
        # Test mode: no process lock; record our pid in a file instead.
        nodemanager_pid = os.getpid()
        servicelogger.log(
            "[INFO]: Running nodemanager in test mode on port 1224, " +
            "pid %s." % str(nodemanager_pid))
        nodeman_pid_file = open(
            os.path.join(os.getcwd(), 'nodemanager.pid'), 'w')
        try:
            nodeman_pid_file.write(str(nodemanager_pid))
        finally:
            nodeman_pid_file.close()
    else:
        # Ensure that only one instance is running at a time.
        gotlock = runonce.getprocesslock("seattlenodemanager")
        # Anything other than True means the lock was not obtained; a truthy
        # value is logged as the pid of the process holding it.
        if gotlock != True:
            if gotlock:
                servicelogger.log(
                    "[ERROR]:Another node manager process (pid: " +
                    str(gotlock) + ") is running")
            else:
                servicelogger.log(
                    "[ERROR]:Another node manager process is running")
            return

    # The closing quote used to be a single quote, so the version was logged
    # with mismatched quotes; it now closes with a double quote.
    servicelogger.log('[INFO]: This is Seattle release "' + version + '"')

    # Feature add for #1031: log information about the system in the nm log.
    servicelogger.log('[INFO]:platform.python_version(): "' +
                      str(platform.python_version()) + '"')
    servicelogger.log('[INFO]:platform.platform(): "' +
                      str(platform.platform()) + '"')

    # uname on Android only yields 'Linux', so be more specific there.
    try:
        import android
        servicelogger.log('[INFO]:platform.uname(): Android / "' +
                          str(platform.uname()) + '"')
    except ImportError:
        servicelogger.log('[INFO]:platform.uname(): "' +
                          str(platform.uname()) + '"')

    # Load the persisted node manager configuration.
    servicelogger.log("[INFO]:Loading config")
    # BUG: Do this better? Is this the right way to engineer this?
    configuration = persist.restore_object("nodeman.cfg")

    # Armon: initialize the network restrictions.
    initialize_ip_interface_restrictions(configuration)

    # ZACK BOKA: on Linux and Darwin, make sure the new seattle crontab entry
    # has been installed. This must happen after nodeman.cfg was read.
    if nonportable.ostype == 'Linux' or nonportable.ostype == 'Darwin':
        if 'crontab_updated_for_2009_installer' not in configuration or \
                configuration['crontab_updated_for_2009_installer'] == False:
            try:
                # crontab may not exist on Android, so skip the update when
                # the android module can be imported. See #1302 and #1254.
                try:
                    import android
                except ImportError:
                    import update_crontab_entry
                    modified_crontab_entry = \
                        update_crontab_entry.modify_seattle_crontab_entry()
                    # Once the update succeeded, remember it so later
                    # start-ups do not try again.
                    if modified_crontab_entry:
                        configuration[
                            'crontab_updated_for_2009_installer'] = True
                        persist.commit_object(configuration, "nodeman.cfg")
            except Exception:
                exception_traceback_string = traceback.format_exc()
                servicelogger.log("[ERROR]: The following error occured when "
                                  + "modifying the crontab for the new 2009 "
                                  + "seattle crontab entry: "
                                  + exception_traceback_string)
def start_accepter(): global accepter_thread global affix_enabled global affix_stack_string # do this until we get the accepter started... while True: if not node_reset_config['reset_accepter'] and is_accepter_started(): # we're done, return the name! return myname_port else: # Just use getmyip(), this is the default behavior and will work if we have preferences set # We only want to call getmyip() once, rather than in the loop since this potentially avoids # rebuilding the allowed IP cache for each possible port bind_ip = emulcomm.getmyip() # Attempt to have the nodemanager listen on an available port. # Once it is able to listen, create a new thread and pass it the socket. # That new thread will be responsible for handling all of the incoming connections. for portindex in range(len(configuration['ports'])): possibleport = configuration['ports'][portindex] try: # There are two possible implementations available here: # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881 # 2) Use a repy socket, but then possibly leak many connections. # Check to see if AFFIX is enabled. try: affix_enabled_lookup = advertise_lookup(enable_affix_key)[-1] servicelogger.log("affix_enabled_lookup is " + str(affix_enabled_lookup)) # Now we check if the last entry is True or False. if affix_enabled_lookup == 'True': affix_stack_string = advertise_lookup(affix_service_key)[-1] affix_enabled = True servicelogger.log("[INFO]: Current advertised Affix string: " + str(affix_stack_string)) else: affix_enabled = False except (AdvertiseError, TimeoutError), e: servicelogger.log("Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) affix_enabled = False # Raise error on debug mode. if DEBUG_MODE: raise except ValueError: servicelogger.log("Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) affix_enabled = False # Raise error on debug mode. 
if DEBUG_MODE: raise except IndexError: servicelogger.log("Trying to look up Affix enabled threw " + str(type(e)) + " " + str(e)) # This will occur if the advertise server returns an empty list. affix_enabled = False # Raise error on debug mode. if DEBUG_MODE: raise # If AFFIX is enabled, then we use AFFIX to open up a tcpserversocket. if affix_enabled: # Here we are going to use a for loop to find a second available port # for us to use for the LegacyAffix. Since the LegacyAffix opens up two # tcpserversocket, it needs two available ports. The first for a normal # repy listenforconnection call, the second for affix enabled # listenforconnection call. # We keep track of how many times we failed to listen with the Affix # framework. If we exceed 3, we default to Repy V2 API. Note that we # will try three times with each port, if we are unable to connect # with legacy Repy V2 API as well. fail_affix_count = 0 error_list = [] for affixportindex in range(portindex+1, len(configuration['ports'])): affixport = configuration['ports'][affixportindex] # Assign the nodemanager name to be the nodekey. We replace any whitespace in the # name and append zenodotus tag at the end. mypubkey = rsa_publickey_to_string(configuration['publickey']).replace(" ", "") myname = sha_hexhash(mypubkey) + '.zenodotus.poly.edu' myname_port = myname + ":" + str(possibleport) # Announce my Zenodotus name # XXX Save the handle, modify the announcement when my address changes! advertisepipe.add_to_pipe(myname, getmyip()) affix_legacy_string = "(CoordinationAffix)(LegacyAffix," + myname + "," + str(affixport) + ",0," affix_legacy_string += "(CoordinationAffix)" + affix_stack_string + ")" affix_object = AffixStackInterface(affix_legacy_string) # Now that we have found the Affix string and have created the AffixStackInterface # object, we will try to open up a listening tcp socket. If we fail to do so # 3 times, we will default to legacy Repy V2 socket. 
try: serversocket = affix_object.listenforconnection(myname, possibleport) servicelogger.log("[INFO]Started accepter thread with Affix string: " + affix_legacy_string) break except (AddressBindingError, AlreadyListeningError, DuplicateTupleError), e: servicelogger.log( "Failed to open listening socket with Affix on port: " + str(affixport) + ". Found error: " + str(e)) fail_affix_count += 1 error_list.append((type(e), str(e))) # If we fail more than 2 times, we will stop attempting to try listening # on a socket with the Affix framework. if fail_affix_count > 2: servicelogger.log("Failed to open socket using Affix after three attemps." + "Now resuming with legacy Repy socket. Errors were: " + str(error_list)) serversocket = timeout_listenforconnection(bind_ip, possibleport, 10) # assign the nodemanager name myname_port = str(bind_ip) + ":" + str(possibleport) break except Exception, e: servicelogger.log("[ERROR] Found Listenforconnection had exception: " + str(e)) raise else: # If AFFIX is not enabled, then we open up a normal tcpserversocket. # For now, we'll use the second method. serversocket = timeout_listenforconnection(bind_ip, possibleport,10) # assign the nodemanager name myname_port = str(bind_ip) + ":" + str(possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False
def start_accepter(): global accepter_thread global affix_enabled global affix_stack_string # do this until we get the accepter started... while True: if not node_reset_config['reset_accepter'] and is_accepter_started(): # we're done, return the name! return myname else: # Just use getmyip(), this is the default behavior and will work if we have preferences set # We only want to call getmyip() once, rather than in the loop since this potentially avoids # rebuilding the allowed IP cache for each possible port bind_ip = emulcomm.getmyip() # Attempt to have the nodemanager listen on an available port. # Once it is able to listen, create a new thread and pass it the socket. # That new thread will be responsible for handling all of the incoming connections. for portindex in range(len(configuration['ports'])): possibleport = configuration['ports'][portindex] try: # There are two possible implementations available here: # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881 # 2) Use a repy socket, but then possibly leak many connections. # Check to see if AFFIX is enabled. try: affix_enabled_lookup = advertise_lookup(enable_affix_key)[-1] # Now we check if the last entry is True or False. if affix_enabled_lookup == 'True': affix_stack_string = advertise_lookup(affix_service_key)[-1] affix_enabled = True else: affix_enabled = False except AdvertiseError: affix_enabled = False except ValueError: affix_enabled = False except IndexError: # This will occur if the advertise server returns an empty list. affix_enabled = False # If AFFIX is enabled, then we use AFFIX to open up a tcpserversocket. if affix_enabled: # Here we are going to use a for loop to find a second available port # for us to use for the LegacyShim. Since the LegacyShim opens up two # tcpserversocket, it needs two available ports. The first for a normal # repy listenforconnection call, the second for shim enabled # listenforconnection call. 
for shimportindex in range(portindex+1, len(configuration['ports'])): shimport = configuration['ports'][shimportindex] affix_legacy_string = "(LegacyShim," + str(shimport) + ",0)" + affix_stack_string affix_object = ShimStackInterface(affix_legacy_string) serversocket = affix_object.listenforconnection(bind_ip, possibleport) servicelogger.log("[INFO]Started accepter thread with Affix string: " + affix_legacy_string) break else: # This is the case if we weren't able to find any port to listen on # With the legacy shim. raise ShimError("Unable to create create tcpserversocket with shims using port:" + str(possibleport)) else: # If AFFIX is not enabled, then we open up a normal tcpserversocket. # For now, we'll use the second method. serversocket = listenforconnection(bind_ip, possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e)) servicelogger.log_last_exception() else: # assign the nodemanager name myname = str(bind_ip) + ":" + str(possibleport) break else: servicelogger.log("[ERROR]: cannot find a port for recvmess")
raise else: # We succeeded in getting our external IP. Leave the loop. break time.sleep(0.1) vesseldict = nmrequesthandler.initialize(myip, configuration['publickey'], version) # Start accepter... myname = start_accepter() # Initialize the global node name inside node reset configuration dict node_reset_config['name'] = myname #send our advertised name to the log servicelogger.log('myname = '+str(myname)) # Start worker thread... start_worker_thread(configuration['pollfrequency']) # Start advert thread... start_advert_thread(vesseldict, myname, configuration['publickey']) # Start status thread... start_status_thread(vesseldict,configuration['pollfrequency']) # we should be all set up now. servicelogger.log("[INFO]:Started")
def start_accepter(): global accepter_thread # do this until we get the accepter started... while True: if not node_reset_config['reset_accepter'] and is_accepter_started(): # we're done, return the name! return myname_port else: # If we came here because a reset was initiated, kill the old # accepter thread server socket before starting a new one. try: accepter_thread.close_serversocket() servicelogger.log("Closed previous accepter thread server socket.") except: # There was no accepter_thread, or it couldn't .close_serversocket(). # No problem -- this means nothing will be in the way of the new # serversocket. pass # Just use getmyip(), this is the default behavior and will work if we have preferences set # We only want to call getmyip() once, rather than in the loop since this potentially avoids # rebuilding the allowed IP cache for each possible port bind_ip = getmyip() # Attempt to have the nodemanager listen on an available port. # Once it is able to listen, create a new thread and pass it the socket. # That new thread will be responsible for handling all of the incoming connections. for portindex in range(len(configuration['ports'])): possibleport = configuration['ports'][portindex] try: # There are two possible implementations available here: # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881 # 2) Use a repy socket, but then possibly leak many connections. # For now, we'll use the second method and use the sockettimeout # library so we can still use a timeout to ensure we don't have # any malicious clients that feed us endless data (or no data) # to tie up the connection. Note that if we are using Affix, # we will be using a TimeoutAffix to achieve the equivalent # outcome. serversocket = timeout_listenforconnection(bind_ip, possibleport,10) # assign the nodemanager name. # We re-retrieve our address using getmyip as we may now be using # a zenodotus name instead. 
myname_port = str(getmyip()) + ":" + str(possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e)) servicelogger.log_last_exception() else: break else: servicelogger.log("[ERROR]: cannot find a port for recvmess")
def handle_threading_error(nmAPI): """ <Purpose> Handles a repy node failing with ThreadErr. Reduces global thread count by 50%. Restarts all existing vesselts <Arguments> nmAPI: the nmAPI module -- passed to the function to avoid import loops; see ticket #590 for more information about this. """ # Make a log of this servicelogger.log( "[ERROR]:A Repy vessel has exited with ThreadErr status. Patching restrictions and reseting all vessels." ) # Get the number of threads Repy has allocated allocatedThreads = get_allocated_threads() # Get the number os system threads currently systemThreads = nonportable.os_api.get_system_thread_count() # Log this information servicelogger.log("[ERROR]:System Threads: " + str(systemThreads) + " Repy Allocated Threads: " + str(allocatedThreads)) # Get the NM configuration configuration = persist.restore_object("nodeman.cfg") # Check if there is a threshold configuration, # otherwise add the default configuration if NOOP_CONFIG_KEY in configuration: threshold = configuration[NOOP_CONFIG_KEY] else: threshold = DEFAULT_NOOP_THRESHOLD configuration[NOOP_CONFIG_KEY] = threshold persist.commit_object(configuration, "nodeman.cfg") # Check if we are below the threshold, if so # then just return, this is a noop if allocatedThreads < systemThreads * threshold: return # We are continuing, so we are above the threshold! # First, update the restrictions update_restrictions() # Then, stop the vessels # Get all the vessels vessels = nmAPI.vesseldict.keys() # Create the stop tuple, exit code 57 with an error message stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.") # Stop each vessel for vessel in vessels: try: # Stop each vessel, using our stoptuple nmAPI.stopvessel(vessel, stoptuple) except Exception, exp: # Forge on, regardless of errors servicelogger.log( "[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: " + str(exp)) servicelogger.log_last_exception()
+ startup_script_path + " cannot be removed.") return False # Remove the symlink. try: os.remove(symlink_path) # Cannot remove the symlink due to some reason. except OSError, e: # The symlink does not exist - that is fine. if e.errno == errno.ENOENT: pass else: # The symlink cannot be removed. _output("The symlink cannot be removed. Make sure you have the " \ + "permission to do so.") servicelogger.log("Seattle cannot be uninstalled because " \ + symlink_path + " cannot be removed.") return False # Stop all instances of seattle from running. stop_all_seattle_processes.main() return True def uninstall_Linux_and_Mac(): """ <Purpose> Remove the seattle entry from the crontab, and kill all seattle processes by using stop_all_seattle_processes.py <Arguments> None.
# MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log("[ERROR] setting up nodemanager serversocket " + "on address " + bind_ip + ":" + str(possibleport) + ": " + repr(e)) servicelogger.log_last_exception() else: break else: # We exhausted the list of possibleport's to no avail. # Pause to avoid busy-waiting for the problem to go away. servicelogger.log("[ERROR]: Could not create serversocket. Sleeping for 30 seconds.") time.sleep(30) # check infrequently time.sleep(configuration['pollfrequency']) # has the thread started? def is_worker_thread_started(): for thread in threading.enumerate(): if 'WorkerThread' in str(thread):
# If AFFIX is enabled, then we use AFFIX to open up a tcpserversocket. if affix_enabled: # Assign the nodemanager name to be the nodekey. We replace any whitespace in the # name and append zenodotus tag at the end. # Announce my (new) Zenodotus name zenodotus_advertise_handle = advertisepipe.add_to_pipe(my_zeno_name, emulcomm.getmyip()) # Now that we have found the Affix string and have created the AffixStackInterface # object, we will try to open up a listening tcp socket. If we get an error, we # fall back to using legacy Repy API. try: serversocket = affix_object.listenforconnection(my_zeno_name, localport) servicelogger.log("[INFO]Started accepter thread with Affix string: " + affix_legacy_string) except (AddressBindingError, AlreadyListeningError, DuplicateTupleError): servicelogger.log("Failed to open listening socket with Affix on port: " + str(localport) + ". Found error: " + str(e) + ". Trying legacy connection.") return old_timeout_listenforconnection(localip, localport, 10) else: # If we did not receive any error, we need to overwrite getmyip() with # the new address. global getmyip getmyip = affix_object.getmyip else: # If Affix is not enaled, we do a normal timeout_listenforconnection # and return the socket. return old_timeout_listenforconnection(localip, localport, 10)
def run(self): # Put everything in a try except block so that if badness happens, we can # log it before dying. try: while True: # remove stale items from the advertise dict. This is important because # we're using membership in the dict to indicate a need to advertise clean_advertise_dict() # this list contains the keys we will advertise advertisekeylist = [] # JAC: advertise under the node's key if rsa_publickey_to_string( self.nodekey ) not in lastadvertisedict and self.nodekey not in advertisekeylist: advertisekeylist.append(self.nodekey) # make a copy so there isn't an issue with a race for vesselname in self.addict.keys()[:]: try: thisentry = self.addict[vesselname].copy() except KeyError: # the entry must have been removed in the meantime. Skip it! continue # if I advertise the vessel... if thisentry['advertise']: # add the owner key if not there already... if rsa_publickey_to_string( thisentry['ownerkey'] ) not in lastadvertisedict and thisentry[ 'ownerkey'] not in advertisekeylist: advertisekeylist.append(thisentry['ownerkey']) # and all user keys if not there already for userkey in thisentry['userkeys']: if rsa_publickey_to_string( userkey ) not in lastadvertisedict and userkey not in advertisekeylist: advertisekeylist.append(userkey) # there should be no dups. assert (advertisekeylist == listops_uniq(advertisekeylist)) # now that I know who to announce to, send messages to annouce my IP and # port to all keys I support for advertisekey in advertisekeylist: try: advertise_announce(advertisekey, str(myname), adTTL) # mark when we advertise lastadvertisedict[rsa_publickey_to_string( advertisekey)] = getruntime() # If the announce succeeded, and node was offline, log info message # and switch it back to online mode. if self.is_offline: info_msg = 'Node is back online.' 
if self.error_count: info_msg += ' (Encountered ' + str(self.error_count) + \ ' advertise errors)' servicelogger.log('[INFO]: ' + info_msg) self.error_count = 0 self.is_offline = False except AdvertiseError, e: # If all announce requests failed, assume node has # gone offline, if str( e ) == "None of the advertise services could be contacted": self.is_offline = True # Log an error message after every 'N' failures if (self.error_count % error_skip_count == 0): servicelogger.log( 'AdvertiseError occured, continuing: ' + str(e)) self.error_count += 1 # Log all other types of errors else: servicelogger.log( 'AdvertiseError occured, continuing: ' + str(e)) except Exception, e: servicelogger.log_last_exception() # an unexpected exception occured, exit and restart return
def start_accepter(): global accepter_thread # do this until we get the accepter started... while True: if not node_reset_config['reset_accepter'] and is_accepter_started(): # we're done, return the name! return myname_port else: # If we came here because a reset was initiated, kill the old # accepter thread server socket before starting a new one. try: accepter_thread.close_serversocket() servicelogger.log( "Closed previous accepter thread server socket.") except: # There was no accepter_thread, or it couldn't .close_serversocket(). # No problem -- this means nothing will be in the way of the new # serversocket. pass # Use getmyip() to find the IP address the nodemanager should # listen on for incoming connections. This will work correctly # if IP/interface preferences have been set. # We only want to call getmyip() once rather than in the loop # since this potentially avoids rebuilding the allowed IP # cache for each possible port bind_ip = getmyip() # Attempt to have the nodemanager listen on an available port. # Once it is able to listen, create a new thread and pass it the socket. # That new thread will be responsible for handling all of the incoming connections. for possibleport in configuration['ports']: try: # Use a Repy socket for listening. This lets us override # the listenforconnection function with a version using an # Affix stack easily; furthermore, we can transparently use # the Repy sockettimeout library to protect against malicious # clients that feed us endless data (or no data) to tie up # the connection. try: serversocket = timeout_listenforconnection( bind_ip, possibleport, 10) except (AlreadyListeningError, DuplicateTupleError), e: # These are rather dull errors that will result in us # trying a different port. Don't print a stack trace. servicelogger.log( "[ERROR]: listenforconnection for address " + bind_ip + ":" + str(possibleport) + " failed with error '" + repr(e) + "'. Retrying.") continue # Assign the nodemanager name. 
# We re-retrieve our address using getmyip as we may now be using # a zenodotus name instead. myname_port = str(getmyip()) + ":" + str(possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log( "[ERROR] setting up nodemanager serversocket " + "on address " + bind_ip + ":" + str(possibleport) + ": " + repr(e)) servicelogger.log_last_exception() else: break
# It wasn't emulcomm.getmyip()'s exception. re-raise. raise else: # We succeeded in getting our external IP. Leave the loop. break time.sleep(0.1) vesseldict = nmrequesthandler.initialize(myip, configuration['publickey'], version) # Start accepter... myname = start_accepter() # Initialize the global node name inside node reset configuration dict node_reset_config['name'] = myname #send our advertised name to the log servicelogger.log('myname = '+str(myname)) # Start worker thread... start_worker_thread(configuration['pollfrequency']) # Start advert thread... start_advert_thread(vesseldict, myname, configuration['publickey']) # Start status thread... start_status_thread(vesseldict,configuration['pollfrequency']) # we should be all set up now. servicelogger.log("[INFO]:Started")
return # Other exceptions only should happen on an internal error and should be # captured by servicelogger.log except Exception,e: servicelogger.log_last_exception() session.session_sendmessage(socketobj,"Internal Error\nError") return # send the output of the command... session.session_sendmessage(socketobj,retstring) except Exception, e: #JAC: Fix for the exception logging observed in #992 if 'Socket closed' in str(e) or 'timed out!' in str(e): servicelogger.log('Connection abruptly closed in send') return else: raise finally: # Prevent leaks try: socketobj.close() except Exception, e: servicelogger.log_last_exception()
def main(): global configuration if not FOREGROUND: # Background ourselves. daemon.daemonize() # Check if we are running in testmode. if TEST_NM: nodemanager_pid = os.getpid() servicelogger.log("[INFO]: Running nodemanager in test mode on port 1224, "+ "pid %s." % str(nodemanager_pid)) nodeman_pid_file = open(os.path.join(os.getcwd(), 'nodemanager.pid'), 'w') # Write out the pid of the nodemanager process that we started to a file. # This is only done if the nodemanager was started in test mode. try: nodeman_pid_file.write(str(nodemanager_pid)) finally: nodeman_pid_file.close() else: # ensure that only one instance is running at a time... gotlock = runonce.getprocesslock("seattlenodemanager") if gotlock == True: # I got the lock. All is well... pass else: if gotlock: servicelogger.log("[ERROR]:Another node manager process (pid: " + str(gotlock) + ") is running") else: servicelogger.log("[ERROR]:Another node manager process is running") return servicelogger.log('[INFO]: This is Seattle release "' + version + "'") # Feature add for #1031: Log information about the system in the nm log... servicelogger.log('[INFO]:platform.python_version(): "' + str(platform.python_version())+'"') servicelogger.log('[INFO]:platform.platform(): "' + str(platform.platform())+'"') # uname on Android only yields 'Linux', let's be more specific. try: import android servicelogger.log('[INFO]:platform.uname(): Android / "' + str(platform.uname())+'"') except ImportError: servicelogger.log('[INFO]:platform.uname(): "'+str(platform.uname())+'"') # I'll grab the necessary information first... servicelogger.log("[INFO]:Loading config") # BUG: Do this better? Is this the right way to engineer this? configuration = persist.restore_object("nodeman.cfg") # If Seattle is not installed, the nodemanager will have no vesseldict # and an incomplete config. Log this problem and exit. try: if configuration["seattle_installed"] is not True: servicelogger.log("[ERROR]:Seattle is not installed. 
Run the Seattle installer to create the required configuration files before starting the nodemanager. Exiting.") harshexit.harshexit(10) except KeyError: # There isn't even a "seattle_installed" entry in this dict!? servicelogger.log("[ERROR]:The nodemanager configuration, nodeman.cfg, is corrupt. Exiting.") harshexit.harshexit(11) # Armon: initialize the network restrictions initialize_ip_interface_restrictions(configuration) # Enable Affix and overload various Repy network API calls # with Affix-enabled calls. # Use the node's publickey to generate a name for our node. mypubkey = rsa_publickey_to_string(configuration['publickey']).replace(" ", "") affix_stack_name = sha_hexhash(mypubkey) enable_affix('(CoordinationAffix)(MakeMeHearAffix)(NamingAndResolverAffix,' + affix_stack_name + ')') # get the external IP address... myip = None while True: try: # Try to find our external IP. myip = emulcomm.getmyip() except Exception, e: # Replace with InternetConnectivityError ? # If we aren't connected to the internet, emulcomm.getmyip() raises this: if len(e.args) >= 1 and e.args[0] == "Cannot detect a connection to the Internet.": # So we try again. pass else: # It wasn't emulcomm.getmyip()'s exception. re-raise. raise else: # We succeeded in getting our external IP. Leave the loop. break time.sleep(0.1)
def handle_threading_error(): """ <Purpose> Handles a repy node failing with ThreadErr. If repy is allowed to use more than 10% of the current threads, reduce the global thread count by 50% and stop all existing vessels <Arguments> None <Exceptions> None <Side Effects> May re-write all resource files and stop all vessels <Returns> None """ # Make a log of this servicelogger.log( "[ERROR]:A Repy vessel has exited with ThreadErr status. Checking to determine next step" ) # Get all the names of the vessels vesselnamelist = nmAPI.vesseldict.keys() # read in all of the resource files so that we can look at and possibly # manipulate them. resourcedicts = {} for vesselname in vesselnamelist: resourcedicts[ vesselname] = resourcemanipulation.read_resourcedict_from_file( 'resource.' + vesselname) # Get the number of threads Repy has allocated allowedthreadcount = 0 for vesselname in vesselnamelist: allowedthreadcount = allowedthreadcount + resourcedicts[vesselname][ 'events'] # Get the total number os system threads currently used totalusedthreads = nonportable.os_api.get_system_thread_count() # Log this information servicelogger.log("[WARNING]:System Threads: " + str(totalusedthreads) + " Repy Allocated Threads: " + str(allowedthreadcount)) # Get the NM configuration configuration = persist.restore_object("nodeman.cfg") # Check if there is a threshold configuration, # otherwise add the default configuration if NOOP_CONFIG_KEY in configuration: threshold = configuration[NOOP_CONFIG_KEY] else: threshold = DEFAULT_NOOP_THRESHOLD configuration[NOOP_CONFIG_KEY] = threshold persist.commit_object(configuration, "nodeman.cfg") # Check if we are below the threshold, if so # then just return, this is a noop if allowedthreadcount < totalusedthreads * threshold: return servicelogger.log("[ERROR]:Reducing number of system threads!") #### We are above the threshold! 
Let's cut everything by 1/2 # First, update the resource files for vesselname in vesselnamelist: # cut the events by 1/2 resourcedicts[vesselname][ 'events'] = resourcedicts[vesselname]['events'] / 2 # write out the new resource files... resourcemanipulation.write_resourcedict_to_file( resourcedicts[vesselname], 'resource.' + vesselname) # Create the stop tuple, exit code 57 with an error message stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.") # Stop each vessel for vesselname in vesselnamelist: try: # Stop each vessel, using our stoptuple nmAPI.stopvessel(vesselname, stoptuple) except Exception, exp: # Forge on, regardless of errors servicelogger.log( "[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: " + str(exp)) servicelogger.log_last_exception()
else: # Just use getmyip(), this is the default behavior and will work if we have preferences set # We only want to call getmyip() once, rather than in the loop since this potentially avoids # rebuilding the allowed IP cache for each possible port bind_ip = emulcomm.getmyip() for possibleport in configuration['ports']: try: if use_nat: # use the sha hash of the nodes public key with the vessel # number as an id for this node unique_id = rsa_publickey_to_string(configuration['publickey']) unique_id = sha_hexhash(unique_id) unique_id = unique_id+str(configuration['service_vessel']) servicelogger.log("[INFO]: Trying NAT wait") nat_waitforconn(unique_id, possibleport, nmconnectionmanager.connection_handler) # do a local waitforconn (not using a forowarder) # this makes the node manager easily accessible locally waitforconn(bind_ip, possibleport, nmconnectionmanager.connection_handler) except Exception, e: servicelogger.log("[ERROR]: when calling waitforconn for the connection_handler: " + str(e)) servicelogger.log_last_exception() else: # the waitforconn was completed so the acceptor is started acceptor_state['lock'].acquire() acceptor_state['started']= True
if (self.error_count % error_skip_count == 0): servicelogger.log('AdvertiseError occured, continuing: '+str(e)) self.error_count += 1 # Log all other types of errors else: servicelogger.log('AdvertiseError occured, continuing: '+str(e)) except Exception, e: servicelogger.log_last_exception() # an unexpected exception occured, exit and restart return # wait to avoid sending too frequently time.sleep(adsleepfrequency) except Exception, e: exceptionstring = "[ERROR]:" (etype, value, tb) = sys.exc_info() for line in traceback.format_tb(tb): exceptionstring = exceptionstring + line # log the exception that occurred. exceptionstring = exceptionstring + str(etype)+" "+str(value)+"\n" servicelogger.log(exceptionstring) raise e
def handle_threading_error(): """ <Purpose> Handles a repy node failing with ThreadErr. If repy is allowed to use more than 10% of the current threads, reduce the global thread count by 50% and stop all existing vessels <Arguments> None <Exceptions> None <Side Effects> May re-write all resource files and stop all vessels <Returns> None """ # Make a log of this servicelogger.log("[ERROR]:A Repy vessel has exited with ThreadErr status. Checking to determine next step") # Get all the names of the vessels vesselnamelist = nmAPI.vesseldict.keys() # read in all of the resource files so that we can look at and possibly # manipulate them. resourcedicts = {} for vesselname in vesselnamelist: resourcedicts[vesselname] = resourcemanipulation.read_resourcedict_from_file('resource.'+vesselname) # Get the number of threads Repy has allocated allowedthreadcount = 0 for vesselname in vesselnamelist: allowedthreadcount = allowedthreadcount + resourcedicts[vesselname]['events'] # Get the total number os system threads currently used totalusedthreads = nonportable.os_api.get_system_thread_count() # Log this information servicelogger.log("[WARNING]:System Threads: "+str(totalusedthreads)+" Repy Allocated Threads: "+str(allowedthreadcount)) # Get the NM configuration configuration = persist.restore_object("nodeman.cfg") # Check if there is a threshold configuration, # otherwise add the default configuration if NOOP_CONFIG_KEY in configuration: threshold = configuration[NOOP_CONFIG_KEY] else: threshold = DEFAULT_NOOP_THRESHOLD configuration[NOOP_CONFIG_KEY] = threshold persist.commit_object(configuration, "nodeman.cfg") # Check if we are below the threshold, if so # then just return, this is a noop if allowedthreadcount < totalusedthreads * threshold: return servicelogger.log("[ERROR]:Reducing number of system threads!") #### We are above the threshold! 
Let's cut everything by 1/2 # First, update the resource files for vesselname in vesselnamelist: # cut the events by 1/2 resourcedicts[vesselname]['events'] = resourcedicts[vesselname]['events'] / 2 # write out the new resource files... resourcemanipulation.write_resourcedict_to_file(resourcedicts[vesselname], 'resource.'+vesselname) # Create the stop tuple, exit code 57 with an error message stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.") # Stop each vessel for vesselname in vesselnamelist: try: # Stop each vessel, using our stoptuple nmAPI.stopvessel(vesselname,stoptuple) except Exception, exp: # Forge on, regardless of errors servicelogger.log("[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: "+str(exp)) servicelogger.log_last_exception()
# assign the nodemanager name myname_port = str(bind_ip) + ":" + str(possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e)) servicelogger.log_last_exception() else: break else: servicelogger.log("[ERROR]: cannot find a port for recvmess") # check infrequently time.sleep(configuration['pollfrequency'])
myname_port = str(bind_ip) + ":" + str(possibleport) # If there is no error, we were able to successfully start listening. # Create the thread, and start it up! accepter = nmconnectionmanager.AccepterThread(serversocket) accepter.start() # Now that we created an accepter, let's use it! set_accepter(accepter) # MOSHE: Is this thread safe!? # Now that waitforconn has been called, unset the accepter reset flag node_reset_config['reset_accepter'] = False except Exception, e: # print bind_ip, port, e servicelogger.log( "[ERROR]: when calling listenforconnection for the connection_handler: " + str(e)) servicelogger.log_last_exception() else: break else: servicelogger.log("[ERROR]: cannot find a port for recvmess") # check infrequently time.sleep(configuration['pollfrequency']) # has the thread started? def is_worker_thread_started(): for thread in threading.enumerate():
def main():
    """
    <Purpose>
      Start up the node manager: optionally daemonize, make sure only one
      instance is running (or record our pid when in test mode), log
      information about the system, load nodeman.cfg, set up the
      IP/interface restrictions and, on Linux and Darwin, update the
      seattle crontab entry if that has not been done yet.

    <Arguments>
      None

    <Exceptions>
      Errors raised while updating the crontab entry are caught and logged.

    <Side Effects>
      May daemonize the process.
      In test mode, writes our pid to 'nodemanager.pid' in the current
      working directory; otherwise tries to take the "seattlenodemanager"
      process lock.
      Rebinds the module-level 'configuration' to the contents of
      nodeman.cfg, and may commit it back with
      'crontab_updated_for_2009_installer' set to True.

    <Returns>
      None.  Returns early when another node manager already holds the
      process lock.
    """
    global configuration

    if not FOREGROUND:
        # Background ourselves.
        daemon.daemonize()

    # Check if we are running in testmode.
    if TEST_NM:
        nodemanager_pid = os.getpid()
        # NOTE(review): '<nodemanager_port>' looks like a placeholder that is
        # substituted by external tooling before this file runs - confirm.
        # '%' binds tighter than '+', so only "pid %s." is formatted.
        servicelogger.log("[INFO]: Running nodemanager in test mode on port <nodemanager_port>, "+
                          "pid %s." % str(nodemanager_pid))
        nodeman_pid_file = open(os.path.join(os.getcwd(), 'nodemanager.pid'), 'w')

        # Write out the pid of the nodemanager process that we started to a file.
        # This is only done if the nodemanager was started in test mode.
        try:
            nodeman_pid_file.write(str(nodemanager_pid))
        finally:
            nodeman_pid_file.close()

    else:
        # ensure that only one instance is running at a time...
        gotlock = runonce.getprocesslock("seattlenodemanager")

        if gotlock == True:
            # I got the lock.   All is well...
            pass
        else:
            # Any other truthy value is logged as the pid of the process that
            # holds the lock; a falsy value means the holder is unknown.
            if gotlock:
                servicelogger.log("[ERROR]:Another node manager process (pid: " + str(gotlock) +
                                  ") is running")
            else:
                servicelogger.log("[ERROR]:Another node manager process is running")
            return

    # Feature add for #1031: Log information about the system in the nm log...
    servicelogger.log('[INFO]:platform.python_version(): "' +
                      str(platform.python_version())+'"')
    servicelogger.log('[INFO]:platform.platform(): "' +
                      str(platform.platform())+'"')

    # uname on Android only yields 'Linux', let's be more specific.
    try:
        # The import is only a probe: it succeeds on Android and raises
        # ImportError everywhere else.
        import android
        servicelogger.log('[INFO]:platform.uname(): Android / "' +
                          str(platform.uname())+'"')
    except ImportError:
        servicelogger.log('[INFO]:platform.uname(): "'+str(platform.uname())+'"')

    # I'll grab the necessary information first...
    servicelogger.log("[INFO]:Loading config")
    # BUG: Do this better?   Is this the right way to engineer this?
    configuration = persist.restore_object("nodeman.cfg")

    # Armon: initialize the network restrictions
    initialize_ip_interface_restrictions(configuration)

    # ZACK BOKA: For Linux and Darwin systems, check to make sure that the new
    #            seattle crontab entry has been installed in the crontab.
    #            Do this here because the "nodeman.cfg" needs to have been read
    #            into configuration via the persist module.
    if nonportable.ostype == 'Linux' or nonportable.ostype == 'Darwin':
        # Only attempt the update while the flag is missing or still False.
        if 'crontab_updated_for_2009_installer' not in configuration or \
                configuration['crontab_updated_for_2009_installer'] == False:
            try:
                # crontab may not exist on Android, therefore let's not check
                # if we are running on Android. See #1302 and #1254.
                try:
                    import android
                except ImportError:
                    # Not on Android: go ahead and update the crontab entry.
                    import update_crontab_entry
                    modified_crontab_entry = \
                        update_crontab_entry.modify_seattle_crontab_entry()
                    # If updating the seattle crontab entry succeeded, then update the
                    # 'crontab_updated_for_2009_installer' so the nodemanager no longer
                    # tries to update the crontab entry when it starts up.
                    if modified_crontab_entry:
                        configuration['crontab_updated_for_2009_installer'] = True
                        persist.commit_object(configuration,"nodeman.cfg")

            except Exception,e:
                # Best effort: a failure here is logged with its traceback and
                # startup continues.
                exception_traceback_string = traceback.format_exc()
                servicelogger.log("[ERROR]: The following error occured when " \
                                  + "modifying the crontab for the new 2009 " \
                                  + "seattle crontab entry: " \
                                  + exception_traceback_string)
if (self.error_count % error_skip_count == 0): servicelogger.log( 'AdvertiseError occured, continuing: ' + str(e)) self.error_count += 1 # Log all other types of errors else: servicelogger.log( 'AdvertiseError occured, continuing: ' + str(e)) except Exception, e: servicelogger.log_last_exception() # an unexpected exception occured, exit and restart return # wait to avoid sending too frequently time.sleep(adsleepfrequency) except Exception, e: exceptionstring = "[ERROR]:" (etype, value, tb) = sys.exc_info() for line in traceback.format_tb(tb): exceptionstring = exceptionstring + line # log the exception that occurred. exceptionstring = exceptionstring + str(etype) + " " + str( value) + "\n" servicelogger.log(exceptionstring) raise e