Example 1
def start_accepter():
  
  unique_id = rsa_publickey_to_string(configuration['publickey'])
  unique_id = sha_hexhash(unique_id) + str(configuration['service_vessel'])
  unique_id += "." + NAME_SERVER
 
  # do this until we get the accepter started...
  while True:

    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname
    
    else:
      for possibleport in configuration['ports']:
        try:
          servicelogger.log("[INFO]: Trying to wait")

          # We advertise the unique_id first so that we can perform waitforconn
          # on it later. It's tempting to do a waitforconn directly on the
          # current IP, but IPs are not unique. If we are behind a NAT, our IP
          # can be some private address which may have duplicates registered in
          # the NAT forwarder. As a result, a client may not be able to locate
          # us within the NAT forwarder. Hence, waitforconn must occur on a unique
          # resolvable name.
          advertise_to_DNS(unique_id)

          timeout_waitforconn(unique_id, possibleport,
                              nmconnectionmanager.connection_handler,
                              timeout=10, use_shim=True, shim_string=default_shim)

        except Exception, e:
          servicelogger.log("[ERROR]: when calling waitforconn for the connection_handler: " + str(e))
          servicelogger.log_last_exception()
        else:
          # the waitforconn was completed so the accepter is started
          accepter_state['lock'].acquire()
          accepter_state['started'] = True
          accepter_state['lock'].release()

          # assign the nodemanager name
          myname = unique_id + ":" + str(possibleport)
          servicelogger.log("[INFO]: Now listening as " + myname)

          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for waitforconn.")
Example 2

  def run(self):
    try: 

      while True:
        # if there are any requests, add them to the dict.
        add_requests()
        
        if len(connection_dict)>0:
          # get the "first" request
          conn = pop_request()
          nmrequesthandler.handle_request(conn)
        else:
          # check at most twice a second (if nothing is new)
          time.sleep(self.sleeptime)

    except:
      servicelogger.log_last_exception()
      raise
Example 3

    def run(self):
        try:

            while True:

                if len(connection_dict) > 0:
                    # get the "first" request
                    conn = pop_request()
                    # Removing this logging which seems excessive...
                    #          servicelogger.log('start handle_request:'+str(id(conn)))
                    nmrequesthandler.handle_request(conn)
                    #          servicelogger.log('finish handle_request:'+str(id(conn)))
                else:
                    # check at most twice a second (if nothing is new)
                    time.sleep(self.sleeptime)

        except:
            servicelogger.log_last_exception()
            raise
Example 4

  def run(self):
    try: 

      while True:
        
        if len(connection_dict)>0:
          # get the "first" request
          conn = pop_request()
          # Removing this logging which seems excessive...
          #          servicelogger.log('start handle_request:'+str(id(conn)))
          nmrequesthandler.handle_request(conn)
          #          servicelogger.log('finish handle_request:'+str(id(conn)))
        else:
          # check at most twice a second (if nothing is new)
          time.sleep(self.sleeptime)

    except:
      servicelogger.log_last_exception()
      raise
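The three run() variants above are the same consumer loop: poll connection_dict for queued connections, hand each one to nmrequesthandler.handle_request, and sleep briefly when the queue is empty. connection_dict, add_requests, and pop_request are module helpers the snippets do not show; the sketch below approximates them with an OrderedDict guarded by a lock (an assumption, not the Repy implementation).

import collections
import threading

connection_dict = collections.OrderedDict()
connection_lock = threading.Lock()

def add_request(connid, conn):
    # queue an accepted connection for the worker loop
    with connection_lock:
        connection_dict[connid] = conn

def pop_request():
    # remove and return the oldest pending connection (FIFO order)
    with connection_lock:
        return connection_dict.popitem(last=False)[1]

add_request(1, "conn-1")
add_request(2, "conn-2")
print(pop_request())   # conn-1: the oldest request is served first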
Example 5
def handle_request(socketobj):

    try:
        # let's get the request...
        # BUG: Should prevent endless data / slow retrieval attacks
        fullrequest = session_recvmessage(socketobj)

    # Armon: Catch a vanilla exception because repy emulated_sockets
    # will raise Exception when the socket has been closed.
    # This is changed from just passing through socket.error,
    # which we were catching previously.
    except Exception, e:
        # close if possible
        safe_close(socketobj)

        # I can't handle this, let's exit
        # BUG: REMOVE LOGGING IN PRODUCTION VERSION (?)
        servicelogger.log_last_exception()
        return
Example 6
def handle_request(socketobj):


  try:
    # let's get the request...
    # BUG: Should prevent endless data / slow retrieval attacks
    fullrequest = session_recvmessage(socketobj)
  
  # Armon: Catch a vanilla exception because repy emulated_sockets
  # will raise Exception when the socket has been closed.
  # This is changed from just passing through socket.error,
  # which we were catching previously.
  except Exception, e:
    # close if possible
    safe_close(socketobj)

    # I can't handle this, let's exit
    # BUG: REMOVE LOGGING IN PRODUCTION VERSION (?)
    servicelogger.log_last_exception()
    return
Example 7
def safe_log_last_exception():
  """
  Log the last exception in a way that cannot throw an exception. First try to
  log using the servicelogger, then just try to print the message.
  """
  try:
    # Get the last exception in case the servicelogger fails.
    exceptionstr = traceback.format_exc()
  except:
    pass
  
  try:
    servicelogger.log_last_exception()
  except:
    try:
      print exceptionstr
    except:
      # As the standard output streams aren't closed, it would seem that this
      # should never happen. If it does, though, what can we do to log the
      # message, other than directly write to a file?
      pass
Example 8
def safe_log_last_exception():
  """
  Log the last exception in a way that cannot throw an exception. First try to
  log using the servicelogger, then just try to print the message.
  """
  try:
    # Get the last exception in case the servicelogger fails.
    exceptionstr = traceback.format_exc()
  except:
    pass
  
  try:
    servicelogger.log_last_exception()
  except:
    try:
      print exceptionstr
    except:
      # As the standard output streams aren't closed, it would seem that this
      # should never happen. If it does, though, what can we do to log the
      # message, other than directly write to a file?
      pass
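Examples 7 and 8 layer their fallbacks so that logging can never raise: format the traceback first, try the service logger, then fall back to printing. A small usage sketch of the same pattern, with print standing in for the service logger and do_work as an illustrative placeholder:

import traceback

def safe_log_last_exception():
    try:
        exceptionstr = traceback.format_exc()
    except Exception:
        return
    try:
        print(exceptionstr)   # stand-in for servicelogger.log_last_exception()
    except Exception:
        pass

def do_work():
    raise ValueError("simulated failure")   # placeholder to exercise the logger

try:
    do_work()
except Exception:
    safe_log_last_exception()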
Example 9

def handle_request(socketobj):

  # always close the socketobj
  try:


    try:
      # let's get the request...
      # BUG: Should prevent endless data / slow retrieval attacks
      fullrequest = session.session_recvmessage(socketobj)
  
    # Armon: Catch a vanilla exception because repy emulated_sockets
    # will raise Exception when the socket has been closed.
    # This is changed from just passing through socket.error,
    # which we were catching previously.
    except Exception, e:

      #JAC: Fix for the exception logging observed in #992
      if 'Socket closed' in str(e) or 'timed out!' in str(e):
        servicelogger.log('Connection abruptly closed during recv')
        return
      elif 'Bad message size' in str(e):
        servicelogger.log('Received bad message size')
        return
      else:
        # I can't handle this, let's exit
        # BUG: REMOVE LOGGING IN PRODUCTION VERSION (?)
        servicelogger.log_last_exception()
        return



    # handle the request as appropriate
    try:
      retstring = process_API_call(fullrequest)

    # Bad parameters, signatures, etc.
    except nmAPI.BadRequest,e:
      session.session_sendmessage(socketobj, str(e)+"\nError")
      return
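Examples 5, 6, and 9 all read one complete request with session_recvmessage, and Example 9 maps framing problems ('Bad message size') to a log-and-return. A plausible mental model for such a session layer is a newline-terminated length prefix followed by the payload; the sketch below implements that model over a file-like stream and is an assumption, not the verified Repy session module.

import io

def send_framed(stream, payload):
    # write "<length>\n<payload>"
    stream.write(str(len(payload)).encode() + b"\n" + payload)

def recv_framed(stream):
    # read digits up to the newline, then exactly that many payload bytes
    header = b""
    while not header.endswith(b"\n"):
        ch = stream.read(1)
        if not ch:
            raise Exception("Socket closed")
        header += ch
    try:
        length = int(header.decode("ascii").strip())
    except ValueError:
        raise Exception("Bad message size")
    return stream.read(length)

buf = io.BytesIO()
send_framed(buf, b"GetVessels")
buf.seek(0)
print(recv_framed(buf))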
Example 10
def start_accepter():
  global accepter_thread

  # do this until we get the accepter started...
  while True:

    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname_port
    
    else:
      # If we came here because a reset was initiated, kill the old 
      # accepter thread server socket before starting a new one.
      try:
        accepter_thread.close_serversocket()
        servicelogger.log("Closed previous accepter thread server socket.")
      except:
        # There was no accepter_thread, or it couldn't .close_serversocket().
        # No problem -- this means nothing will be in the way of the new 
        # serversocket.
        pass


      # Use getmyip() to find the IP address the nodemanager should 
      # listen on for incoming connections. This will work correctly 
      # if IP/interface preferences have been set.
      # We only want to call getmyip() once rather than in the loop 
      # since this potentially avoids rebuilding the allowed IP 
      # cache for each possible port
      bind_ip = getmyip()

      # Attempt to have the nodemanager listen on an available port.
      # Once it is able to listen, create a new thread and pass it the socket.
      # That new thread will be responsible for handling all of the incoming connections.     
      for possibleport in configuration['ports']:
        try:
          # Use a Repy socket for listening. This lets us override 
          # the listenforconnection function with a version using an 
          # Affix stack easily; furthermore, we can transparently use 
          # the Repy sockettimeout library to protect against malicious 
          # clients that feed us endless data (or no data) to tie up 
          # the connection.
          try:
            serversocket = timeout_listenforconnection(bind_ip, possibleport, 10)
          except (AlreadyListeningError, DuplicateTupleError), e:
            # These are rather dull errors that will result in us 
            # trying a different port. Don't print a stack trace.
            servicelogger.log("[ERROR]: listenforconnection for address " + 
                bind_ip + ":" + str(possibleport) + " failed with error '" + 
                repr(e) + "'. Retrying.")
            continue

          # Assign the nodemanager name.
          # We re-retrieve our address using getmyip as we may now be using
          # a zenodotus name instead.
          myname_port = str(getmyip()) + ":" + str(possibleport)

          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()
          
          # Now that we created an accepter, let's use it!          
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?          
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False
        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR] setting up nodemanager serversocket " + 
              "on address " + bind_ip + ":" + str(possibleport) + ": " + 
              repr(e))
          servicelogger.log_last_exception()
        else:
          break
Example 11
    def run(self):
        # Put everything in a try except block so that if badness happens, we can
        # log it before dying.
        try:
            while True:
                # remove stale items from the advertise dict.   This is important because
                # we're using membership in the dict to indicate a need to advertise
                clean_advertise_dict()

                # this list contains the keys we will advertise
                advertisekeylist = []

                # JAC: advertise under the node's key
                if rsa_publickey_to_string(
                        self.nodekey
                ) not in lastadvertisedict and self.nodekey not in advertisekeylist:
                    advertisekeylist.append(self.nodekey)

                # make a copy so there isn't an issue with a race
                for vesselname in self.addict.keys()[:]:

                    try:
                        thisentry = self.addict[vesselname].copy()
                    except KeyError:
                        # the entry must have been removed in the meantime.   Skip it!
                        continue

                    # if I advertise the vessel...
                    if thisentry['advertise']:
                        # add the owner key if not there already...
                        if rsa_publickey_to_string(
                                thisentry['ownerkey']
                        ) not in lastadvertisedict and thisentry[
                                'ownerkey'] not in advertisekeylist:
                            advertisekeylist.append(thisentry['ownerkey'])

                        # and all user keys if not there already
                        for userkey in thisentry['userkeys']:
                            if rsa_publickey_to_string(
                                    userkey
                            ) not in lastadvertisedict and userkey not in advertisekeylist:
                                advertisekeylist.append(userkey)

                # there should be no dups.
                assert (advertisekeylist == listops_uniq(advertisekeylist))

                # now that I know who to announce to, send messages to announce my IP and
                # port to all keys I support
                for advertisekey in advertisekeylist:
                    try:
                        advertise_announce(advertisekey, str(myname), adTTL)
                        # mark when we advertise
                        lastadvertisedict[rsa_publickey_to_string(
                            advertisekey)] = getruntime()

                        # If the announce succeeded, and node was offline, log info message
                        # and switch it back to online mode.
                        if self.is_offline:
                            info_msg = 'Node is back online.'
                            if self.error_count:
                                info_msg += ' (Encountered ' + str(self.error_count) + \
                                              ' advertise errors)'
                            servicelogger.log('[INFO]: ' + info_msg)
                            self.error_count = 0
                            self.is_offline = False

                    except AdvertiseError, e:
                        # If all announce requests failed, assume node has
                        # gone offline,
                        if str(
                                e
                        ) == "None of the advertise services could be contacted":
                            self.is_offline = True
                            # Log an error message after every 'N' failures
                            if (self.error_count % error_skip_count == 0):
                                servicelogger.log(
                                    'AdvertiseError occurred, continuing: ' +
                                    str(e))
                            self.error_count += 1
                        # Log all other types of errors
                        else:
                            servicelogger.log(
                                'AdvertiseError occurred, continuing: ' +
                                str(e))
                    except Exception, e:
                        servicelogger.log_last_exception()
                        # an unexpected exception occured, exit and restart
                        return
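When every advertise service is unreachable, the loop above marks the node offline and logs only every Nth consecutive AdvertiseError so an extended outage does not flood the service log. The counter pattern in isolation (error_skip_count's real value is not shown in the snippet; 10 is an illustrative choice, and print stands in for servicelogger.log):

error_skip_count = 10
error_count = 0

def log_throttled(message):
    # emit only the 1st, 11th, 21st, ... consecutive failure
    global error_count
    if error_count % error_skip_count == 0:
        print(message)
    error_count += 1

for attempt in range(25):
    log_throttled("AdvertiseError occurred, continuing")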
Example 12
def start_accepter():
  global accepter_thread

  # do this until we get the accepter started...
  while True:

    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname_port
    
    else:
      # If we came here because a reset was initiated, kill the old 
      # accepter thread server socket before starting a new one.
      try:
        accepter_thread.close_serversocket()
        servicelogger.log("Closed previous accepter thread server socket.")
      except:
        # There was no accepter_thread, or it couldn't .close_serversocket().
        # No problem -- this means nothing will be in the way of the new 
        # serversocket.
        pass


      # Just use getmyip(), this is the default behavior and will work if we have preferences set
      # We only want to call getmyip() once, rather than in the loop since this potentially avoids
      # rebuilding the allowed IP cache for each possible port
      bind_ip = getmyip()

      # Attempt to have the nodemanager listen on an available port.
      # Once it is able to listen, create a new thread and pass it the socket.
      # That new thread will be responsible for handling all of the incoming connections.     
      for portindex in range(len(configuration['ports'])):
        possibleport = configuration['ports'][portindex]
        try:
          # There are two possible implementations available here:
          # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881
          # 2) Use a repy socket, but then possibly leak many connections.
      
          # For now, we'll use the second method and use the sockettimeout
          # library so we can still use a timeout to ensure we don't have
          # any malicious clients that feed us endless data (or no data)
          # to tie up the connection. Note that if we are using Affix,
          # we will be using a TimeoutAffix to achieve the equivalent
          # outcome.
          serversocket = timeout_listenforconnection(bind_ip, possibleport,10)

          # assign the nodemanager name.
          # We re-retrieve our address using getmyip as we may now be using
          # a zenodotus name instead.
          myname_port = str(getmyip()) + ":" + str(possibleport)

          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()
          
          # Now that we created an accepter, let's use it!          
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?          
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False
        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e))
          servicelogger.log_last_exception()
        else:
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for recvmess")
Example 13
def handle_threading_error():
  """
  <Purpose>
    Handles a repy node failing with ThreadErr. If repy is allowed to use
    more than 10% of the current threads, reduce the global thread count by 50%
    and stop all existing vessels

  <Arguments>
    None
  
  <Exceptions>
    None

  <Side Effects>
    May re-write all resource files and stop all vessels

  <Returns>
    None
  """
  # Make a log of this
  servicelogger.log("[ERROR]:A Repy vessel has exited with ThreadErr status. Checking to determine next step")

  # Get all the names of the vessels
  vesselnamelist = nmAPI.vesseldict.keys()
  
  # read in all of the resource files so that we can look at and possibly 
  # manipulate them.
  resourcedicts = {}
  for vesselname in vesselnamelist:
    resourcedicts[vesselname] = resourcemanipulation.read_resourcedict_from_file('resource.'+vesselname)
  
  # Get the number of threads Repy has allocated
  allowedthreadcount = 0
  for vesselname in vesselnamelist:
    allowedthreadcount = allowedthreadcount + resourcedicts[vesselname]['events']
  
  # Get the total number of system threads currently used
  totalusedthreads = nonportable.os_api.get_system_thread_count()
  
  # Log this information
  servicelogger.log("[WARNING]:System Threads: "+str(totalusedthreads)+"  Repy Allocated Threads: "+str(allowedthreadcount))
  
  # Get the NM configuration
  configuration = persist.restore_object("nodeman.cfg")
  
  # Check if there is a threshold configuration,
  # otherwise add the default configuration
  if NOOP_CONFIG_KEY in configuration:
    threshold = configuration[NOOP_CONFIG_KEY]
  else:
    threshold = DEFAULT_NOOP_THRESHOLD
    configuration[NOOP_CONFIG_KEY] = threshold
    persist.commit_object(configuration, "nodeman.cfg")
  
  # Check if we are below the threshold, if so
  # then just return, this is a noop
  if allowedthreadcount < totalusedthreads * threshold:
    return
  
  servicelogger.log("[ERROR]:Reducing number of system threads!")



  #### We are above the threshold!   Let's cut everything by 1/2

  # First, update the resource files
  for vesselname in vesselnamelist:
    # cut the events by 1/2
    resourcedicts[vesselname]['events'] = resourcedicts[vesselname]['events'] / 2
    # write out the new resource files...
    resourcemanipulation.write_resourcedict_to_file(resourcedicts[vesselname], 'resource.'+vesselname)
  

  
  
  # Create the stop tuple, exit code 57 with an error message
  stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.")
  
  # Stop each vessel
  for vesselname in vesselnamelist:
    try:
      # Stop each vessel, using our stoptuple
      nmAPI.stopvessel(vesselname,stoptuple)
    except Exception, exp:
      # Forge on, regardless of errors
      servicelogger.log("[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: "+str(exp))
      servicelogger.log_last_exception()
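The early return above is the noop branch: nothing happens while Repy's allocated event count stays below threshold * system threads. Isolating that check (DEFAULT_NOOP_THRESHOLD's value is not shown in the snippet; 0.10 below is an assumption matching the "more than 10% of the current threads" wording in the docstring):

DEFAULT_NOOP_THRESHOLD = 0.10   # assumed from the docstring, not the snippet

def should_reduce(allowedthreadcount, totalusedthreads,
                  threshold=DEFAULT_NOOP_THRESHOLD):
    # mirror of: "if allowedthreadcount < totalusedthreads * threshold: return"
    return allowedthreadcount >= totalusedthreads * threshold

print(should_reduce(120, 1000))   # True: 120 >= 100, so events get halved
print(should_reduce(80, 1000))    # False: below the threshold, a noop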
Example 14
def start_accepter():
  global accepter_thread
  global affix_enabled
  global affix_stack_string

  # do this until we get the accepter started...
  while True:

    if not node_reset_config['reset_accepter'] and is_accepter_started():
      # we're done, return the name!
      return myname
    
    else:
      # Just use getmyip(), this is the default behavior and will work if we have preferences set
      # We only want to call getmyip() once, rather than in the loop since this potentially avoids
      # rebuilding the allowed IP cache for each possible port
      bind_ip = emulcomm.getmyip()
      
      # Attempt to have the nodemanager listen on an available port.
      # Once it is able to listen, create a new thread and pass it the socket.
      # That new thread will be responsible for handling all of the incoming connections.     
      for portindex in range(len(configuration['ports'])):
        possibleport = configuration['ports'][portindex]
        try:
          # There are two possible implementations available here:
          # 1) Use a raw (python) socket, and so we can have a timeout, as per ticket #881
          # 2) Use a repy socket, but then possibly leak many connections.
          
          # Check to see if AFFIX is enabled.
          try:
            affix_enabled_lookup = advertise_lookup(enable_affix_key)[-1]
            # Now we check if the last entry is True or False.
            if affix_enabled_lookup == 'True':
              affix_stack_string = advertise_lookup(affix_service_key)[-1]
              affix_enabled = True
            else:
              affix_enabled = False
          except AdvertiseError:
            affix_enabled = False
          except ValueError:
            affix_enabled = False
          except IndexError:
            # This will occur if the advertise server returns an empty list.
            affix_enabled = False

      
          # If AFFIX is enabled, then we use AFFIX to open up a tcpserversocket.
          if affix_enabled:
            # Here we are going to use a for loop to find a second available port
            # for us to use for the LegacyShim. Since the LegacyShim opens up two
            # tcpserversocket, it needs two available ports. The first for a normal
            # repy listenforconnection call, the second for shim enabled 
            # listenforconnection call.
            for shimportindex in range(portindex+1, len(configuration['ports'])):
              shimport = configuration['ports'][shimportindex]
              affix_legacy_string = "(LegacyShim," + str(shimport) + ",0)" + affix_stack_string
              affix_object = ShimStackInterface(affix_legacy_string)
              serversocket = affix_object.listenforconnection(bind_ip, possibleport)
              servicelogger.log("[INFO]Started accepter thread with Affix string: " + affix_legacy_string)
              break
            else:
              # This is the case if we weren't able to find any port to listen on
              # With the legacy shim.
              raise ShimError("Unable to create tcpserversocket with shims using port: " + str(possibleport))

          else:
            # If AFFIX is not enabled, then we open up a normal tcpserversocket.
            # For now, we'll use the second method.
            serversocket = listenforconnection(bind_ip, possibleport)
          
          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()
          
          # Now that we created an accepter, let's use it!          
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?          
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False
        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e))
          servicelogger.log_last_exception()
        else:
          # assign the nodemanager name
          myname = str(bind_ip) + ":" + str(possibleport)
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for recvmess")
Example 15
def handle_threading_error(nmAPI):
    """
  <Purpose>
    Handles a repy node failing with ThreadErr. Reduces global thread count by 50%.
    Restarts all existing vessels.

  <Arguments>
    nmAPI: the nmAPI module -- passed to the function to avoid import loops;
           see ticket #590 for more information about this.
  """
    # Make a log of this
    servicelogger.log(
        "[ERROR]:A Repy vessel has exited with ThreadErr status. Patching restrictions and reseting all vessels."
    )

    # Get the number of threads Repy has allocated
    allocatedThreads = get_allocated_threads()

    # Get the number of system threads currently in use
    systemThreads = nonportable.os_api.get_system_thread_count()

    # Log this information
    servicelogger.log(
        "[ERROR]:System Threads: " + str(systemThreads) + "  Repy Allocated Threads: " + str(allocatedThreads)
    )

    # Get the NM configuration
    configuration = persist.restore_object("nodeman.cfg")

    # Check if there is a threshold configuration,
    # otherwise add the default configuration
    if NOOP_CONFIG_KEY in configuration:
        threshold = configuration[NOOP_CONFIG_KEY]
    else:
        threshold = DEFAULT_NOOP_THRESHOLD
        configuration[NOOP_CONFIG_KEY] = threshold
        persist.commit_object(configuration, "nodeman.cfg")

    # Check if we are below the threshold, if so
    # then just return, this is a noop
    if allocatedThreads < systemThreads * threshold:
        return

    # We are continuing, so we are above the threshold!
    # First, update the restrictions
    update_restrictions()

    # Then, stop the vessels
    # Get all the vessels
    vessels = nmAPI.vesseldict.keys()

    # Create the stop tuple, exit code 57 with an error message
    stoptuple = (57, "Fatal system-wide threading error! Stopping all vessels.")

    # Stop each vessel
    for vessel in vessels:
        try:
            # Stop each vessel, using our stoptuple
            nmAPI.stopvessel(vessel, stoptuple)
        except Exception, exp:
            # Forge on, regardless of errors
            servicelogger.log("[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: " + str(exp))
            servicelogger.log_last_exception()
Example 16
  def run(self):
    # Put everything in a try except block so that if badness happens, we can
    # log it before dying.
    try:
      while True:
        # remove stale items from the advertise dict.   This is important because
        # we're using membership in the dict to indicate a need to advertise
        clean_advertise_dict()

        # this list contains the keys we will advertise
        advertisekeylist = []

        # JAC: advertise under the node's key
        if rsa_publickey_to_string(self.nodekey) not in lastadvertisedict and self.nodekey not in advertisekeylist:
          advertisekeylist.append(self.nodekey)


        # make a copy so there isn't an issue with a race
        for vesselname in self.addict.keys()[:]:

          try:
            thisentry = self.addict[vesselname].copy()
          except KeyError:
            # the entry must have been removed in the meantime.   Skip it!
            continue

          # if I advertise the vessel...
          if thisentry['advertise']:
            # add the owner key if not there already...
            if rsa_publickey_to_string(thisentry['ownerkey']) not in lastadvertisedict and thisentry['ownerkey'] not in advertisekeylist:
              advertisekeylist.append(thisentry['ownerkey'])

            # and all user keys if not there already
            for userkey in thisentry['userkeys']:
              if rsa_publickey_to_string(userkey) not in lastadvertisedict and userkey not in advertisekeylist:
                advertisekeylist.append(userkey)


        # there should be no dups.   
        assert(advertisekeylist == listops_uniq(advertisekeylist))

        # now that I know who to announce to, send messages to announce my IP and
        # port to all keys I support
        for advertisekey in advertisekeylist:
          try:
            advertise_announce(advertisekey, str(myname), adTTL)
            # mark when we advertise
            lastadvertisedict[rsa_publickey_to_string(advertisekey)] = getruntime()
         
            # If the announce succeeded, and node was offline, log info message
            # and switch it back to online mode.
            if self.is_offline:
              info_msg = 'Node is back online.'
              if self.error_count:
                info_msg += ' (Encountered ' + str(self.error_count) + \
                              ' advertise errors)'
              servicelogger.log('[INFO]: ' + info_msg)
              self.error_count = 0
              self.is_offline = False
          
          except AdvertiseError, e:
            # If all announce requests failed, assume node has
            # gone offline, 
            if str(e) == "None of the advertise services could be contacted":
              self.is_offline = True
              # Log an error message after every 'N' failures
              if (self.error_count % error_skip_count == 0):
                servicelogger.log('AdvertiseError occurred, continuing: '+str(e))
              self.error_count += 1
            # Log all other types of errors
            else:
              servicelogger.log('AdvertiseError occurred, continuing: '+str(e))
          except Exception, e:
            servicelogger.log_last_exception()
            # an unexpected exception occured, exit and restart
            return


Example 17

    # handle the request as appropriate
    try:
      retstring = process_API_call(fullrequest)

    # Bad parameters, signatures, etc.
    except nmAPI.BadRequest,e:
      session.session_sendmessage(socketobj, str(e)+"\nError")
      return

    # Other exceptions only should happen on an internal error and should be
    # captured by servicelogger.log
    except Exception,e:
      servicelogger.log_last_exception()
      session.session_sendmessage(socketobj,"Internal Error\nError")
      return
 
    # send the output of the command...
    session.session_sendmessage(socketobj,retstring)

  except Exception, e:
    #JAC: Fix for the exception logging observed in #992
    if 'Socket closed' in str(e) or 'timed out!' in str(e):
      servicelogger.log('Connection abruptly closed in send')
      return
    else:
      raise
  
  finally:
    # always close the socketobj -- the snippet is truncated at this point, so
    # this body is a gap-fill based on the comment at the top of Example 9 and
    # the safe_close helper used in Examples 5 and 6
    safe_close(socketobj)
Example 18
def start_accepter():
    global accepter_thread

    # do this until we get the accepter started...
    while True:

        if not node_reset_config['reset_accepter'] and is_accepter_started():
            # we're done, return the name!
            return myname_port

        else:
            # If we came here because a reset was initiated, kill the old
            # accepter thread server socket before starting a new one.
            try:
                accepter_thread.close_serversocket()
                servicelogger.log(
                    "Closed previous accepter thread server socket.")
            except:
                # There was no accepter_thread, or it couldn't .close_serversocket().
                # No problem -- this means nothing will be in the way of the new
                # serversocket.
                pass

            # Use getmyip() to find the IP address the nodemanager should
            # listen on for incoming connections. This will work correctly
            # if IP/interface preferences have been set.
            # We only want to call getmyip() once rather than in the loop
            # since this potentially avoids rebuilding the allowed IP
            # cache for each possible port
            bind_ip = getmyip()

            # Attempt to have the nodemanager listen on an available port.
            # Once it is able to listen, create a new thread and pass it the socket.
            # That new thread will be responsible for handling all of the incoming connections.
            for possibleport in configuration['ports']:
                try:
                    # Use a Repy socket for listening. This lets us override
                    # the listenforconnection function with a version using an
                    # Affix stack easily; furthermore, we can transparently use
                    # the Repy sockettimeout library to protect against malicious
                    # clients that feed us endless data (or no data) to tie up
                    # the connection.
                    try:
                        serversocket = timeout_listenforconnection(
                            bind_ip, possibleport, 10)
                    except (AlreadyListeningError, DuplicateTupleError), e:
                        # These are rather dull errors that will result in us
                        # trying a different port. Don't print a stack trace.
                        servicelogger.log(
                            "[ERROR]: listenforconnection for address " +
                            bind_ip + ":" + str(possibleport) +
                            " failed with error '" + repr(e) + "'. Retrying.")
                        continue

                    # Assign the nodemanager name.
                    # We re-retrieve our address using getmyip as we may now be using
                    # a zenodotus name instead.
                    myname_port = str(getmyip()) + ":" + str(possibleport)

                    # If there is no error, we were able to successfully start listening.
                    # Create the thread, and start it up!
                    accepter = nmconnectionmanager.AccepterThread(serversocket)
                    accepter.start()

                    # Now that we created an accepter, let's use it!
                    set_accepter(accepter)

                    # MOSHE: Is this thread safe!?
                    # Now that waitforconn has been called, unset the accepter reset flag
                    node_reset_config['reset_accepter'] = False
                except Exception, e:
                    # print bind_ip, port, e
                    servicelogger.log(
                        "[ERROR] setting up nodemanager serversocket " +
                        "on address " + bind_ip + ":" + str(possibleport) +
                        ": " + repr(e))
                    servicelogger.log_last_exception()
                else:
                    break
Example 19
    def run(self):
        try:
            while True:

                # the race condition here is that they might delete something and I will
                # check it.   This is okay.   I'll end up getting a KeyError when trying
                # to update the dictionary (checked below) or look at the old entry.
                for vesselname in self.statusdict.keys()[:]:

                    try:
                        statusfilename = self.statusdict[vesselname][
                            'statusfilename']
                        oldstatus = self.statusdict[vesselname]['status']
                    except KeyError:
                        # race condition, this was removed in the meantime.
                        continue

                    # there should be a status file (assuming we've inited)

                    try:
                        status, timestamp = statusstorage.read_status(
                            statusfilename)
                    except IOError, e:
                        # if the file exists, raise the exception since we don't know what
                        # it is about.
                        if e[0] != 2:
                            raise

                        # file not found.   This means it is fresh...
                        status = 'Fresh'
                        timestamp = time.time()

                    # Armon: Check if status is ThreadErr, this is a critical error condition
                    # that requires lowering the global thread count, and resetting all vessels
                    if status == "ThreadErr":
                        # Check if this is the first time for this timestamp
                        # Since the status file is not removed, this is necessary so that we do not
                        # continuously trigger the error handling code
                        if not timestamp in self.threadErrSet:
                            # Add the timestamp
                            self.threadErrSet.add(timestamp)

                            # Call the error handling module
                            nmthreadingerror.handle_threading_error(self.nmAPI)

                    # The status has a timestamp in case the process is killed harshly and
                    # needs to be restarted.   This allows ordering of status reports
                    staleness = time.time() - timestamp

                    if staleness < 0:
                        # time is running backwards, likely an NTP update (allow it)...
                        #            print "Time is running backwards by increment '"+str(staleness)+"', allowing this"
                        newstatus = status

                    elif staleness > updatebound:
                        # stale?
                        newstatus = oldstatus

                        if oldstatus == 'Started':

                            # BUG: What happens if we're wrong and it's alive?   What do we do?
                            # How do we detect and fix this safely?
                            newstatus = 'Stale'
                            # We set the timestamp so that our update happens in the table...
                            timestamp = time.time() - updatebound

                    else:
                        # it seems to be okay.   Use the given status
                        newstatus = status

                    update_status(self.statusdict, vesselname, newstatus,
                                  timestamp)

                time.sleep(self.sleeptime)

        except Exception, e:
            servicelogger.log_last_exception()
            raise e
Example 20

  def run(self):
    try:
      while True:

        # the race condition here is that they might delete something and I will
        # check it.   This is okay.   I'll end up getting a KeyError when trying
        # to update the dictionary (checked below) or look at the old entry.
        for vesselname in self.statusdict.keys()[:]:

          try:
            statusfilename = self.statusdict[vesselname]['statusfilename']
            oldstatus = self.statusdict[vesselname]['status']
          except KeyError:
            # race condition, this was removed in the meantime.
            continue
  
  
          # there should be a status file (assuming we've inited)
  
          try: 
            status,timestamp = statusstorage.read_status(statusfilename)
          except IOError, e:
            # if the file exists, raise the exception since we don't know what
            # it is about.
            if e[0] != 2:
              raise

            # file not found.   This means it is fresh...
            status = 'Fresh'
            timestamp = time.time()
               
          
          # Armon: Check if status is ThreadErr, this is a critical error condition
          # that requires lowering the global thread count, and resetting all vessels
          if status == "ThreadErr":
            # Check if this is the first time for this timestamp
            # Since the status file is not removed, this is necessary so that we do not
            # continuously trigger the error handling code
            if not timestamp in self.threadErrSet:
              # Add the timestamp
              self.threadErrSet.add(timestamp)
              
              # Call the error handling module
              nmthreadingerror.handle_threading_error(self.nmAPI)
          
          # The status has a timestamp in case the process is killed harshly and 
          # needs to be restarted.   This allows ordering of status reports
          staleness = time.time() - timestamp
  
          if staleness < 0:
            # time is running backwards, likely an NTP update (allow it)...
#            print "Time is running backwards by increment '"+str(staleness)+"', allowing this"
            newstatus = status
         
          elif staleness > updatebound:  
            # stale?
            newstatus = oldstatus

            if oldstatus == 'Started':
  
              # BUG: What happens if we're wrong and it's alive?   What do we do?
              # How do we detect and fix this safely?
              newstatus = 'Stale'
              # We set the timestamp so that our update happens in the table...
              timestamp = time.time() - updatebound
  
          else:
            # it seems to be okay.   Use the given status
            newstatus = status
            
          update_status(self.statusdict, vesselname, newstatus, timestamp)
  
        time.sleep(self.sleeptime)
    
    except Exception,e:
      servicelogger.log_last_exception()
      raise e
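The staleness logic in Examples 19 and 20 reduces to a small decision: a negative staleness (the clock stepped backwards, e.g. by NTP) is accepted, a report older than updatebound demotes a 'Started' vessel to 'Stale', and anything else keeps the status read from the file. The same decision as a pure function (updatebound's real value is configuration the snippets do not show; 10 seconds is illustrative):

import time

updatebound = 10   # seconds; illustrative, not the snippets' real bound

def classify(filestatus, oldstatus, timestamp, now=None):
    if now is None:
        now = time.time()
    staleness = now - timestamp
    if staleness < 0:
        return filestatus    # time ran backwards (likely NTP); allow it
    elif staleness > updatebound:
        if oldstatus == 'Started':
            return 'Stale'   # report too old for a supposedly running vessel
        return oldstatus     # stale report: keep the previous status
    else:
        return filestatus    # fresh enough; trust the status file

print(classify('Started', 'Started', time.time() - 60))   # 'Stale'
print(classify('Fresh', 'Fresh', time.time()))            # 'Fresh'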
Example 21
def handle_threading_error():
    """
  <Purpose>
    Handles a repy node failing with ThreadErr. If repy is allowed to use
    more than 10% of the current threads, reduce the global thread count by 50%
    and stop all existing vessels

  <Arguments>
    None
  
  <Exceptions>
    None

  <Side Effects>
    May re-write all resource files and stop all vessels

  <Returns>
    None
  """
    # Make a log of this
    servicelogger.log(
        "[ERROR]:A Repy vessel has exited with ThreadErr status. Checking to determine next step"
    )

    # Get all the names of the vessels
    vesselnamelist = nmAPI.vesseldict.keys()

    # read in all of the resource files so that we can look at and possibly
    # manipulate them.
    resourcedicts = {}
    for vesselname in vesselnamelist:
        resourcedicts[
            vesselname] = resourcemanipulation.read_resourcedict_from_file(
                'resource.' + vesselname)

    # Get the number of threads Repy has allocated
    allowedthreadcount = 0
    for vesselname in vesselnamelist:
        allowedthreadcount = allowedthreadcount + resourcedicts[vesselname][
            'events']

    # Get the total number of system threads currently used
    totalusedthreads = nonportable.os_api.get_system_thread_count()

    # Log this information
    servicelogger.log("[WARNING]:System Threads: " + str(totalusedthreads) +
                      "  Repy Allocated Threads: " + str(allowedthreadcount))

    # Get the NM configuration
    configuration = persist.restore_object("nodeman.cfg")

    # Check if there is a threshold configuration,
    # otherwise add the default configuration
    if NOOP_CONFIG_KEY in configuration:
        threshold = configuration[NOOP_CONFIG_KEY]
    else:
        threshold = DEFAULT_NOOP_THRESHOLD
        configuration[NOOP_CONFIG_KEY] = threshold
        persist.commit_object(configuration, "nodeman.cfg")

    # Check if we are below the threshold, if so
    # then just return, this is a noop
    if allowedthreadcount < totalusedthreads * threshold:
        return

    servicelogger.log("[ERROR]:Reducing number of system threads!")

    #### We are above the threshold!   Let's cut everything by 1/2

    # First, update the resource files
    for vesselname in vesselnamelist:
        # cut the events by 1/2
        resourcedicts[vesselname][
            'events'] = resourcedicts[vesselname]['events'] / 2
        # write out the new resource files...
        resourcemanipulation.write_resourcedict_to_file(
            resourcedicts[vesselname], 'resource.' + vesselname)

    # Create the stop tuple, exit code 57 with an error message
    stoptuple = (57,
                 "Fatal system-wide threading error! Stopping all vessels.")

    # Stop each vessel
    for vesselname in vesselnamelist:
        try:
            # Stop each vessel, using our stoptuple
            nmAPI.stopvessel(vesselname, stoptuple)
        except Exception, exp:
            # Forge on, regardless of errors
            servicelogger.log(
                "[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: "
                + str(exp))
            servicelogger.log_last_exception()
Example 22
          myname_port = str(bind_ip) + ":" + str(possibleport)
          # If there is no error, we were able to successfully start listening.
          # Create the thread, and start it up!
          accepter = nmconnectionmanager.AccepterThread(serversocket)
          accepter.start()
          
          # Now that we created an accepter, let's use it!          
          set_accepter(accepter)

          # MOSHE: Is this thread safe!?          
          # Now that waitforconn has been called, unset the accepter reset flag
          node_reset_config['reset_accepter'] = False
        except Exception, e:
          # print bind_ip, port, e
          servicelogger.log("[ERROR]: when calling listenforconnection for the connection_handler: " + str(e))
          servicelogger.log_last_exception()
        else:
          break

      else:
        servicelogger.log("[ERROR]: cannot find a port for recvmess")

    # check infrequently
    time.sleep(configuration['pollfrequency'])
  





Example 23
def handle_threading_error(nmAPI):
    """
  <Purpose>
    Handles a repy node failing with ThreadErr. Reduces global thread count by 50%.
    Restarts all existing vessels.

  <Arguments>
    nmAPI: the nmAPI module -- passed to the function to avoid import loops;
           see ticket #590 for more information about this.
  """
    # Make a log of this
    servicelogger.log(
        "[ERROR]:A Repy vessel has exited with ThreadErr status. Patching restrictions and reseting all vessels."
    )

    # Get the number of threads Repy has allocated
    allocatedThreads = get_allocated_threads()

    # Get the number of system threads currently in use
    systemThreads = nonportable.os_api.get_system_thread_count()

    # Log this information
    servicelogger.log("[ERROR]:System Threads: " + str(systemThreads) +
                      "  Repy Allocated Threads: " + str(allocatedThreads))

    # Get the NM configuration
    configuration = persist.restore_object("nodeman.cfg")

    # Check if there is a threshold configuration,
    # otherwise add the default configuration
    if NOOP_CONFIG_KEY in configuration:
        threshold = configuration[NOOP_CONFIG_KEY]
    else:
        threshold = DEFAULT_NOOP_THRESHOLD
        configuration[NOOP_CONFIG_KEY] = threshold
        persist.commit_object(configuration, "nodeman.cfg")

    # Check if we are below the threshold, if so
    # then just return, this is a noop
    if allocatedThreads < systemThreads * threshold:
        return

    # We are continuing, so we are above the threshold!
    # First, update the restrictions
    update_restrictions()

    # Then, stop the vessels
    # Get all the vessels
    vessels = nmAPI.vesseldict.keys()

    # Create the stop tuple, exit code 57 with an error message
    stoptuple = (57,
                 "Fatal system-wide threading error! Stopping all vessels.")

    # Stop each vessel
    for vessel in vessels:
        try:
            # Stop each vessel, using our stoptuple
            nmAPI.stopvessel(vessel, stoptuple)
        except Exception, exp:
            # Forge on, regardless of errors
            servicelogger.log(
                "[ERROR]:Failed to reset vessel (Handling ThreadErr). Exception: "
                + str(exp))
            servicelogger.log_last_exception()