Exemple #1
0
class Node(Daemon):
   """
   Node is started up on the remote instance via the bootstrapping process for that instance.
   The node is responsible for tracking active streams and managing the workers that process
   the jobs from thosee streams.  If a stream goes idle (ie, there are no more jobs in the streams
   queue and all workers have died) then node will stop tracking the stream.  If jobs re-appear
   on the stream Node will spawn new workers to process those jobs.  If a new stream appears 
   Node will spawn new workers to processs the jobs on that stream.  Each worker is an independent
   concurrent process that inherits the stream to process from the Node.
   """

   def __init__(self, queue, qauthkey, mpps= 5, dfs= None, dauthkey= None, logdir= curdir, piddir= curdir, **properties):
      """Initialize the Node's I/O stream and connect to the Queue and/or DFS."""     

      self.id= getipaddress()
      self.queue= queue
      self.qauthkey= qauthkey
      self.mpps= mpps
      self.dfs= dfs
      self.dauthkey= dauthkey
      self.properties= properties
      self.shutdown= Value('i', 0)

      self.workers= {}
      self.alive= True
      self.start_time= datetime.utcnow()

      self.connect()
  
      super(Node, self).__init__(
         pidfile= path.join(piddir, self.__class__.__name__ + ".pid"),
         stdout= path.join(logdir, self.__class__.__name__ + ".out"),
         stderr= path.join(logdir, self.__class__.__name__ + ".err"),
         stdin= path.join(logdir, self.__class__.__name__ + ".in")
      )

   def connect(self):
      """Connects to the Queue and/or DFS on the host/port for whic hthe Node was intialized for."""

      self.qconnect()
      if None not in self.dfs:
         self.dconnect()

   def qconnect(self):
      """
      Attempts to connect to the Queue on the host/port for which the Node was initialized for.
      If no connection can be made, Node will keep attempting to connect until a connection
      can be established.  One connection is established the remove methods requested will be
      registered.
      """

      # remove connection from cache:
      # BaseProxy class has thread local storage which caches the connection
      # which is reused for future connections causing "borken pipe" errors on 
      # creating new manager.  
      if self.queue in BaseProxy._address_to_local:
         if hasattr(BaseProxy._address_to_local[self.queue][0], 'connection'):
            del BaseProxy._address_to_local[self.queue][0].connection

      # register handlers
      SyncManager.register("get_streams")
      SyncManager.register("get_queue")
      SyncManager.register("get_store")
      SyncManager.register("get_properties")

      print "connecting to queue", self.queue
      while self.alive:

         try:
            self.impq= SyncManager(address= self.queue, authkey= self.qauthkey)
            self.impq.connect() 
            print "connected to queue", self.queue
            break
         except (EOFError, IOError, SocketError) as e:
            print "could not connect ...trying again", str(e)
            sleep(1)

   def dconnect(self):
      """
      Attempts to connect to the DFS on the host/port for which the Node was initialized for.
      If no connection can be made, Node will keep attempting to connect until a connection
      can be established. Once a connection can be established the remove methods requested
      will be registered.
      """

      # remove connection from cache:
      # BaseProxy class has thread local storage which caches the connection
      # which is reused for future connections causing "borken pipe" errors on
      # creating new manager.
      if self.dfs in BaseProxy._address_to_local:
         if hasattr(BaseProxy._address_to_local[self.dfs][0], 'connection'):
            del BaseProxy._address_to_local[self.dfs][0].connection

      # register handlers
      SyncManager.register("get_nodes")

      print "connecting to dfs", self.dfs
      while self.alive:

         try:
            self.impd= SyncManager(address= self.dfs, authkey= self.dauthkey)
            self.impd.connect()
            print "connected to dfs", self.dfs
            break
         except (EOFError, IOError, SocketError) as e:
            print "could not connect ...trying again", str(e)
            sleep(1)

   def process(self):
      """
      Starts tracking streams. When a stream is found which matches the Node's criteria
      workers are assigned to the stream and spawned to start processing jobs from the
      streams queue. When the stream goes idle and all workers for that stream have died
      the Node will stop tracking the stream until new jobs appear on the stream. Node will
      limit the amount of workers it can spawn for a stream to the configred amount.  If Node was
      started with the --dfs option then status updates about how many streams, workers and jobs
      are being processed will continually be sent back to DFS via a configurable rate.
      """

      print "processing", self.mpps

      # get list of streams proxies
      streams= self.impq.get_streams()
      streams_tracking= {} 

      # if reporting to DFS 
      # track nodes via shared dict else maintain local dict
      if hasattr(self, 'impd'):
         nodes= self.impd.get_nodes()

      idle_time= datetime.utcnow()

      while self.alive:

         # get list of streams to track we are not currently tracking
         streams_to_track= filter(lambda stream_id: stream_id not in streams_tracking.keys(), streams.keys())

         # if we are only tracking streams with specific properties
         # TODO: need to think through this more
         """
         if len(self.properties):

            # get properties for all the streams we are tracking
            stream_properties= [dict(self.impq.get_properties(stream_id)) for stream_id in streams_to_track]

            # filter out streams we want to track based on matching subsets of properties
            if "id" in self.properties:
               streams_to_track= map(lambda sp: sp.get("id"), filter(lambda sp: set(sp.items()).issubset(self.properties.items()), stream_properties))
            else:
               streams_to_track= map(lambda sp: sp.get("id"), filter(lambda sp: set(filter(lambda (property_name, property_value): property_name != 'id', sp.items())).issubset(self.properties.items()), stream_properties))
         """

         for stream_id in streams_to_track:
            print "tracking stream", stream_id
            streams_tracking.update([(stream_id, (self.impq.get_queue(stream_id), self.impq.get_store(stream_id), self.properties))])

         # stop tracking streams which are no longer active
         for stream_id in streams_tracking.keys():
            if stream_id not in streams.keys():
               print 'stopped tracking stream', stream_id
               streams_tracking.pop(stream_id)

         # stop tracking workers which are no longer active
         for (pid, worker) in self.workers.items():
            if not worker.is_alive():
               #print "worker dead", pid, worker.stream_id
               self.workers.pop(pid)
            else:
               idle_time= datetime.utcnow()

         # create workers for streams we are currently tracking
         for (stream_id, (queue, store, properties)) in streams_tracking.items():

            qsize= queue.qsize()
            stream_workers= filter(lambda w: w.stream_id == stream_id, self.workers.values())
            num_stream_workers= min(qsize, self.mpps - len(stream_workers))
            if num_stream_workers:
               print "creating %s workers for %s" % (num_stream_workers, stream_id)
            for i in range(1, num_stream_workers + 1):
               worker= Worker(self.id, stream_id, queue, store, properties, self.shutdown)
               worker.start()
               self.workers.update([(worker.pid, worker)])
               idle_time= datetime.utcnow()
               print "created worker", i, worker.pid, stream_id

         status= Status(
            mpps= self.mpps,
            streams= len(streams_tracking),
            workers= len(self.workers),
            starttime= self.start_time,
            uptime= datetime.utcnow() - self.start_time,
            lastactivity= idle_time,
            idletime= datetime.utcnow() - idle_time,
            properties= self.properties,
            pid= getpid()
         )

         if hasattr(self, 'impd'):
            nodes.update([(self.id, status)])

         # if a worker has been blocked then stop all workers and shutdown
         # this will then cause the Node to go idle and DFS will shut it down
         # and restart a new node to take it's place
         if self.shutdown.value:
            print >> stderr, "node blocked", self.id
            self.alive= False

         sleep(1)

      self.stop()

   def stop(self):
      """
      Starts the shutdown process for the Node.  Waits for all
      workers to finish their activity. If Node was started with 
      the --dfs option then it will de-register itself with DFS.
      """

      # wait for workers to finish before shutting down
      print "shutting down node..."
      self.alive= False
      for (pid, worker) in self.workers.items():
         print "waiting for worker:", pid, worker.stream_id
         worker.join()

      # if reporting to DFS 
      # track nodes via shared dict else maintain local dict
      if hasattr(self, 'impd'):
         print "de-registering nodes with dfs"
         nodes= self.impd.get_nodes()
         try:
            del nodes[self.id]
         except KeyError:
            print >> stderr, "node not registered with dfs", self.id
 
      print "node shutdown complete."
      super(Node, self).stop()

   def run(self):
      """
      Starts processing the streams which match the given properties of the Node.
      If a connection error between Node and Queue/DFs occurs Node will continually
      try to re-establish connection.
      """

      while self.alive:
         try:
            self.process()
         except (KeyboardInterrupt, Exception) as e:
            if type(e) == KeyboardInterrupt:
               self.stop()
            else:
               print >> stderr, "queue/dfs communication error", str(e)
               self.connect()
         sleep(1)