Example #1
class Node(Daemon):
   """
   Node is started up on the remote instance via the bootstrapping process for that instance.
   The node is responsible for tracking active streams and managing the workers that process
   the jobs from thosee streams.  If a stream goes idle (ie, there are no more jobs in the streams
   queue and all workers have died) then node will stop tracking the stream.  If jobs re-appear
   on the stream Node will spawn new workers to process those jobs.  If a new stream appears 
   Node will spawn new workers to processs the jobs on that stream.  Each worker is an independent
   concurrent process that inherits the stream to process from the Node.
   """

   def __init__(self, queue, qauthkey, mpps= 5, dfs= None, dauthkey= None, logdir= curdir, piddir= curdir, **properties):
      """Initialize the Node's I/O stream and connect to the Queue and/or DFS."""     

      self.id= getipaddress()
      self.queue= queue
      self.qauthkey= qauthkey
      self.mpps= mpps
      self.dfs= dfs
      self.dauthkey= dauthkey
      self.properties= properties
      self.shutdown= Value('i', 0)

      self.workers= {}
      self.alive= True
      self.start_time= datetime.utcnow()

      self.connect()
  
      super(Node, self).__init__(
         pidfile= path.join(piddir, self.__class__.__name__ + ".pid"),
         stdout= path.join(logdir, self.__class__.__name__ + ".out"),
         stderr= path.join(logdir, self.__class__.__name__ + ".err"),
         stdin= path.join(logdir, self.__class__.__name__ + ".in")
      )

   def connect(self):
      """Connects to the Queue and/or DFS on the host/port for whic hthe Node was intialized for."""

      self.qconnect()
      if self.dfs is not None and None not in self.dfs:
         self.dconnect()

   def qconnect(self):
      """
      Attempts to connect to the Queue on the host/port for which the Node was initialized for.
      If no connection can be made, Node will keep attempting to connect until a connection
      can be established.  One connection is established the remove methods requested will be
      registered.
      """

      # remove connection from cache:
      # BaseProxy class has thread-local storage which caches the connection
      # and reuses it for future connections, causing "broken pipe" errors when
      # creating a new manager.
      if self.queue in BaseProxy._address_to_local:
         if hasattr(BaseProxy._address_to_local[self.queue][0], 'connection'):
            del BaseProxy._address_to_local[self.queue][0].connection

      # register handlers
      SyncManager.register("get_streams")
      SyncManager.register("get_queue")
      SyncManager.register("get_store")
      SyncManager.register("get_properties")

      print "connecting to queue", self.queue
      while self.alive:

         try:
            self.impq= SyncManager(address= self.queue, authkey= self.qauthkey)
            self.impq.connect() 
            print "connected to queue", self.queue
            break
         except (EOFError, IOError, SocketError) as e:
            print "could not connect ...trying again", str(e)
            sleep(1)

   def dconnect(self):
      """
      Attempts to connect to the DFS on the host/port for which the Node was initialized for.
      If no connection can be made, Node will keep attempting to connect until a connection
      can be established. Once a connection can be established the remove methods requested
      will be registered.
      """

      # remove connection from cache:
      # BaseProxy class has thread-local storage which caches the connection
      # and reuses it for future connections, causing "broken pipe" errors when
      # creating a new manager.
      if self.dfs in BaseProxy._address_to_local:
         if hasattr(BaseProxy._address_to_local[self.dfs][0], 'connection'):
            del BaseProxy._address_to_local[self.dfs][0].connection

      # register handlers
      SyncManager.register("get_nodes")

      print "connecting to dfs", self.dfs
      while self.alive:

         try:
            self.impd= SyncManager(address= self.dfs, authkey= self.dauthkey)
            self.impd.connect()
            print "connected to dfs", self.dfs
            break
         except (EOFError, IOError, SocketError) as e:
            print "could not connect ...trying again", str(e)
            sleep(1)

   def process(self):
      """
      Starts tracking streams. When a stream is found which matches the Node's criteria
      workers are assigned to the stream and spawned to start processing jobs from the
      streams queue. When the stream goes idle and all workers for that stream have died
      the Node will stop tracking the stream until new jobs appear on the stream. Node will
      limit the amount of workers it can spawn for a stream to the configred amount.  If Node was
      started with the --dfs option then status updates about how many streams, workers and jobs
      are being processed will continually be sent back to DFS via a configurable rate.
      """

      print "processing", self.mpps

      # get list of stream proxies
      streams= self.impq.get_streams()
      streams_tracking= {} 

      # if reporting to DFS, track this node's status via the shared nodes dict
      if hasattr(self, 'impd'):
         nodes= self.impd.get_nodes()

      idle_time= datetime.utcnow()

      while self.alive:

         # get list of streams which we are not currently tracking
         streams_to_track= filter(lambda stream_id: stream_id not in streams_tracking.keys(), streams.keys())

         # if we are only tracking streams with specific properties
         # TODO: need to think through this more
         """
         if len(self.properties):

            # get properties for all the streams we are tracking
            stream_properties= [dict(self.impq.get_properties(stream_id)) for stream_id in streams_to_track]

            # filter out streams we want to track based on matching subsets of properties
            if "id" in self.properties:
               streams_to_track= map(lambda sp: sp.get("id"), filter(lambda sp: set(sp.items()).issubset(self.properties.items()), stream_properties))
            else:
               streams_to_track= map(lambda sp: sp.get("id"), filter(lambda sp: set(filter(lambda (property_name, property_value): property_name != 'id', sp.items())).issubset(self.properties.items()), stream_properties))
         """

         for stream_id in streams_to_track:
            print "tracking stream", stream_id
            streams_tracking.update([(stream_id, (self.impq.get_queue(stream_id), self.impq.get_store(stream_id), self.properties))])

         # stop tracking streams which are no longer active
         for stream_id in streams_tracking.keys():
            if stream_id not in streams.keys():
               print 'stopped tracking stream', stream_id
               streams_tracking.pop(stream_id)

         # stop tracking workers which are no longer active
         for (pid, worker) in self.workers.items():
            if not worker.is_alive():
               #print "worker dead", pid, worker.stream_id
               self.workers.pop(pid)
            else:
               idle_time= datetime.utcnow()

         # create workers for streams we are currently tracking
         for (stream_id, (queue, store, properties)) in streams_tracking.items():

            qsize= queue.qsize()
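            # spawn at most (mpps - currently active workers) new workers for this
            # stream, and never more than there are jobs waiting in the stream's queue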
            stream_workers= filter(lambda w: w.stream_id == stream_id, self.workers.values())
            num_stream_workers= min(qsize, self.mpps - len(stream_workers))
            if num_stream_workers:
               print "creating %s workers for %s" % (num_stream_workers, stream_id)
            for i in range(1, num_stream_workers + 1):
               worker= Worker(self.id, stream_id, queue, store, properties, self.shutdown)
               worker.start()
               self.workers.update([(worker.pid, worker)])
               idle_time= datetime.utcnow()
               print "created worker", i, worker.pid, stream_id

         status= Status(
            mpps= self.mpps,
            streams= len(streams_tracking),
            workers= len(self.workers),
            starttime= self.start_time,
            uptime= datetime.utcnow() - self.start_time,
            lastactivity= idle_time,
            idletime= datetime.utcnow() - idle_time,
            properties= self.properties,
            pid= getpid()
         )

         if hasattr(self, 'impd'):
            nodes.update([(self.id, status)])

         # if a worker has been blocked then stop all workers and shutdown.
         # this will then cause the Node to go idle and DFS will shut it down
         # and start a new node to take its place
         if self.shutdown.value:
            print >> stderr, "node blocked", self.id
            self.alive= False

         sleep(1)

      self.stop()

   def stop(self):
      """
      Starts the shutdown process for the Node.  Waits for all
      workers to finish their activity. If Node was started with 
      the --dfs option then it will de-register itself with DFS.
      """

      # wait for workers to finish before shutting down
      print "shutting down node..."
      self.alive= False
      for (pid, worker) in self.workers.items():
         print "waiting for worker:", pid, worker.stream_id
         worker.join()

      # if reporting to DFS, de-register this node from the shared nodes dict
      if hasattr(self, 'impd'):
         print "de-registering nodes with dfs"
         nodes= self.impd.get_nodes()
         try:
            del nodes[self.id]
         except KeyError:
            print >> stderr, "node not registered with dfs", self.id
 
      print "node shutdown complete."
      super(Node, self).stop()

   def run(self):
      """
      Starts processing the streams which match the given properties of the Node.
      If a connection error between Node and Queue/DFs occurs Node will continually
      try to re-establish connection.
      """

      while self.alive:
         try:
            self.process()
         except KeyboardInterrupt:
            self.stop()
         except Exception as e:
            print >> stderr, "queue/dfs communication error", str(e)
            self.connect()
         sleep(1)
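
A minimal sketch of how a Node like this might be launched, assuming the Daemon base class follows the usual start()/run() daemon pattern; the host names, ports, authkeys and directories below are placeholders rather than values from the original source.

if __name__ == "__main__":

   # connect to a Queue (and optionally a DFS); all addresses and keys are illustrative
   node= Node(
      queue= ("queue.example.com", 50000),
      qauthkey= "impetus",
      mpps= 5,
      dfs= ("dfs.example.com", 50001),
      dauthkey= "impetus",
      logdir= "/var/log/impetus",
      piddir= "/var/run/impetus"
   )
   node.start()   # daemonize; the Daemon base class is assumed to invoke run()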
Example #2
class Impetus(object):
   """
   Multi-threaded library for interfacing with the Impetus system. 
   Hides threading considerations from the client.  Determines callback
   methods through introspection if callbacks are not explicitly stated. 
   Decorators are provided for the client to indicate methods which run on 
   the remote nodes and local process methods which consume the results. 
   Creates a single stream per instance.  The client can create additional
   streams through the Queue's remote methods via the "impq" handler. 
   """

   statuses= ("forked", "processed")

   def __init__(self, address, authkey, taskdir= "tasks", id= None, **properties):
      """Creates a stream and retrieves the streams priority queue and data-store."""

      self.id= id if id else str(uuid1())
      self.ipaddress= getipaddress()

      self.address= address
      self.taskdir= path.join(taskdir, self.id)
      self.properties= properties

      self.impq= SyncManager(address= self.address, authkey= authkey)
      self.impq.register("get_streams")
      self.impq.register("create_stream")
      self.impq.register("delete_stream")
      self.impq.register("get_store")
      self.impq.register("get_queue")
      self.impq.connect()

      self.jobs= []
      self.impq.create_stream(id= self.id, ipaddress= self.ipaddress, **properties)
      self.store= self.impq.get_store(id= self.id)
      self.queue= self.impq.get_queue(id= self.id)
      self.alive= True
      self._current_thread= None
      self._lock= Lock()
      self.threads= []
      self.errors= {}
      self.ready= {}
      self._progress= {}


      try:
         makedirs(self.taskdir)
      except OSError:
         # the task directory may already exist
         pass

   def __del__(self):
      """Deletes the stream that was created during initialization."""

      self.impq.delete_stream(self.id)

   @staticmethod
   def node(target):
      """
      All methods that are to run on remote nodes must be staticmethods
      as the context of which the methods was defined can not be serialized.
      """

      return target

   @staticmethod
   def startup(target):
      """
      Sets up the startup method for the object to run as a thread.
      """

      def _process(self):

         target(self)

      global _declaration_order
      _process.order= _declaration_order
      _declaration_order+= 1
      return _process

   @staticmethod
   def shutdown(target):
      """
      Sets up the shutdown method to be excuted 
      after all threads have been terminated.  The 
      ready and errors parameters will contain a dict 
      of file-handles pointing to the results files
      (ie, ../tasks/<task_id>/<method>.ok, .err>
      for each @process method.
      """
   
      def _shutdown(self):

         target(self, self.ready, self.errors, self._progress)

      global _declaration_order
      _shutdown.order= _declaration_order
      return _shutdown

   @staticmethod
   def process(target):
      """
      Sets up method to run as a thread. The method will be 
      called with a list of currently available jobs that are 
      either in a ready or error state. The thread will die
      when it has finished processing all the jobs the previous
      @process method forked and when the previous @process method
      has terminatted. Each thread will be regulated so that all 
      threads have an eventual chance of executing.  Order of execution
      is not guarenteed and thread scheudling is handled by the 
      operating system.
      """

      def _process(self):

         current_thread= currentThread()
         if current_thread.name == 'MainThread':
            return
         previous_thread= current_thread.previous_thread

         while self.alive:

            self._thread_regulator(current_thread, previous_thread)

            with self._lock:
               jobs= filter(lambda job: job.get("callback") == current_thread.name, self.store.values())
               ready= filter(lambda job: job.get("status") == "ready", jobs)
               errors= filter(lambda job: job.get("status") == "error", jobs)

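               # persist finished jobs to this thread's .ok/.err files (JSON, compressed
               # and encoded) and drop them from the shared store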
               for job in ready:
                  self.ready[current_thread.name].write(encode(compress(jdumps(job, cls= JobEncoder))) + "\n")
                  self.store.pop(job.get("id"))

               for job in errors:
                  self.errors[current_thread.name].write(encode(compress(jdumps(job, cls= JobEncoder))) + "\n")
                  self.store.pop(job.get("id"))
        
            if len(ready) or len(errors):
               target(self, ready, errors)

            self._thread_progress(current_thread.name, "processed", len(ready) + len(errors))
            self._show_progress(current_thread)

            if len(self.store) == 0 and previous_thread != None and previous_thread.is_alive() == False:
               print "%s %s completed" % (datetime.utcnow(), current_thread.name)
               stdout.flush()
               self.alive= False

            sleep(0.01)
         
      global _declaration_order
      _process.order= _declaration_order
      _declaration_order+= 1
       
      return _process

   def fork(self, target, args, callback= None, priority= None, job_id= None, **properties):
      """
      Turns the target method to be forked into byte-code and creates a Job.  The Job
      is initialized to the starting state and placed the the streams priorty queue 
      for execution. 
      """
 
      if self.properties.get('mss'):

         stall_time= 1
         while len(self.store) > int(self.properties.get('mss')):
            print "throttling size %s exceeds mss %s" % (len(self.store), self.properties.get('mss'))
            sleep(stall_time)  
            stall_time+= 1
            if stall_time >= 10:
               break

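      # if no callback was given, results are routed to the next @process
      # thread in declaration order (current_thread.next_thread)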
      current_thread= currentThread()
      job= Job(
         client= {"id": self.id, "ipaddress": self.ipaddress},
         name= target.func_name,
         code= encode(compress(mdumps(target.func_code))),
         args= args,
         callback= callback if callback else current_thread.next_thread.name,
         result= None,
         transport= None,
         **properties
      )
      
      if priority:
         setattr(job, "priority", priority)

      self.store.update([(job.get("id"), job)])
      self.queue.put([(job.get("priority"), job.get("id"))])
  
      #print "forked", len(self.store)
      
      self.jobs.append(job.get("id"))
      
      self._thread_progress(current_thread.name, "forked", 1)
      
      return job.get("id")

   def _thread_progress(self, name, status, count):
      """
      Keeps track of how many jobs the current
      thread has forked/processed.
      """

      with self._lock:
 
         progress= self._progress.get(name, dict([(s, 0) for s in self.statuses]))
         progress.update([(status, progress.get(status, 0) + count)])
         self._progress.update([(name, progress)])

   def _show_progress(self, current_thread):
      """Displays the current threads progress to stdout."""
 
      msg= []
      with self._lock:
         for thread in self.threads:
            progress= self._progress.get(thread.name, dict([(s, 0) for s in self.statuses]))
            msg.append("%s %s/%s -> " % (thread.name, progress.get("forked"), progress.get("processed")))

      print "thread: %s via %s" % (''.join(msg)[:-4], current_thread.name)
         
   def _thread_regulator(self, current_thread, previous_thread):
      """
      Regulates the current thread so all threads have an eventual 
      chance to run.  Thread scheduling is handled by the operating-system. 
      If the operating-system repeatively schedules the same thread than
      that thread is immediately put to sleep so the operating-system
      can schedule a new thread.
      """

      stall_time= 1
      while self._current_thread == current_thread:
         #print "stalling:", current_thread.name, stall_time
         sleep(stall_time)
         stall_time+= 1
         if stall_time >= 10:
            break

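         # if the last declared thread is the one stalling and its predecessor has
         # finished, hand the token to the first thread so the stall loop can exit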
         if current_thread.name == self.threads[-1].name and previous_thread != None and previous_thread.is_alive() == False:
            with self._lock:
               self._current_thread= self.threads[0]

      with self._lock:
         #print "setting current thread", current_thread.name
         self._current_thread= current_thread

   def _create_thread(self, name, method):
      """
      Creates thread for the @process method as well 
      as error/ready file handlers for which all jobs
      in an error/ready state are written to. All threads
      are maintained in an internal thread list.
      """

      thread= Thread(target= method, name= name, args= (self, ))
      self.errors[name]= open(path.join(self.taskdir, '.'.join((name, "err"))), 'ab+')
      self.ready[name]= open(path.join(self.taskdir, '.'.join((name, "ok"))), 'ab+')

      return thread
 
   def _link_threads(self, threads):
      """
      Creates previous/next properties for each thread based
      on the threads' declaration order.
      """
 
      for i in range(len(threads)):
        setattr(threads[i], "previous_thread", threads[i-1] if i > 0 else None)
        setattr(threads[i], "next_thread", threads[i+1] if i < len(threads)-1 else None)
  
      return threads[0]

   def _start_threads(self, threads):
      """Starts all threads based on their delcaration order."""

      for thread in threads:
         thread.start()
      for thread in threads:
         thread.join()

   def run(self):
      """Creates one thread per @startup/@process method, in declaration order."""

      methods= filter(lambda (name, method): type(method) == FunctionType and method.__name__ == "_process", self.__class__.__dict__.items())
      self.threads= [self._create_thread(name, method) for (name, method) in sorted(methods, key= lambda (name, method): method.order)]
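
To illustrate how the decorators fit together, here is an illustrative sketch of a client subclass, assuming that calling run() (backed by the _link_threads/_start_threads helpers above) drives the @startup/@process/@shutdown pipeline; the class name, workload and connection details are not taken from the original source.

class Factorials(Impetus):

   def __init__(self, address, authkey, **properties):
      super(Factorials, self).__init__(address, authkey, **properties)

   @Impetus.node
   def factorial(n):
      # runs on a remote worker; must not reference any enclosing context
      result= 1
      for i in range(2, n + 1):
         result*= i
      return result

   @Impetus.startup
   def start(self):
      # fork one job per input; with no explicit callback the results
      # flow to the next @process method in declaration order
      for n in range(10):
         self.fork(self.factorial, args= n)

   @Impetus.process
   def results(self, ready, errors):
      for job in ready:
         print job.get("name"), job.get("args"), "->", job.get("result")

   @Impetus.shutdown
   def done(self, ready, errors, progress):
      print "all jobs processed", progress


if __name__ == "__main__":
   client= Factorials(("queue.example.com", 50000), "impetus")
   client.run()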