class PostGreSQLEngine(Component): """ PostGreSQL DB Engine """ implements(IDBEngine) db_user = ConfigItem('postgresqlengine', 'user', 'postgresql', """User name of the postgresql database.""") db_pass = ConfigItem('postgresqlengine', 'password', '', """Password of the postgresql database.""") db_host = ConfigItem('postgresqlengine', 'host', 'localhost', """Host of the postgresql database.""") db_port = ConfigItem('postgresqlengine', 'port', '5432', """Port of the postgresql database.""") db_name = ConfigItem('postgresqlengine', 'databasename', 'mydb', """Name of the postgresql database.""") db_echo = BoolItem('postgresqlengine', 'echo', True, """Use SQL Alchemy debug output.""") def __init__(self): """ Initialize engine """ self.engine = None def _get_connection_string(self): """ Private method to build the current connection string """ return _( "postgresql://%(user)s:%(password)s@%(host)s:%(port)s/%(dbname)s", user=self.db_user, password=self.db_pass, host=self.db_host, port=self.db_port, dbname=self.db_name) def get_engine(self): """ Create a PostGreSQL DB Engine """ if not self.engine: self.engine = create_engine(self._get_connection_string(), echo=self.db_echo) return self.engine def initdb(self): """ Called from the dbmanager once it gets initialized """ #if not self.get_engine(): # raise EngineCreationFailedError(_("Failed to create db engine for: %(connectionstring)s", connectionstring=self._get_connection_string())) def dbshutdown(self): """
class GenericTaskScheduleStrategy(Component): implements(ITaskScheduleStrategy) """ Implements a schedule strategy to select the next valid slave that should process a given task. We update the workers and rank them so that we give a slave with idle workers a better rating than a worker with pending work. - If rank > 0 => Slave has idle processes - If rank == 0 => Slave has currently the same number of tasks then processes - If rank < 0 => Slave has currently more tasks thank workers """ def setup(self, scheduler, master): self.scheduler = scheduler self.master = master def rate(self, slave): """ Rate a slave without any lock. Less if better. """ # Worst rating if we have a slave without workers! if slave.workers < 1: return sys.float_info.max # Basic rating is the task/worker ratio rating = max(0, slave.tasks / (slave.workers * 1.0)) # TODO: Add task finished per second ratio return rating def get_next_slave(self): """ Get the slave that should process the next task, get the one with better rating """ if self.master.node_registry: # Find the best score best_rating = sys.float_info.max best_node = None for node in self.master.node_registry: if self.master.node_registry[node] and self.master.node_registry[node].state == NodeState.active \ and self.master.node_registry[node].type == NodeType.slave \ and self.master.node_registry[node].rating < best_rating: best_rating = self.master.node_registry[node].rating best_node = node # Get all nodes with the same or similar score (some nice epsilon?) #Nodes ... #for node in self.master.node_registry: # if node and node.rating == best_rating: # best_rating = node.rating # Get a random slave form the first 10% of slaves. This will give us a bit # of randomness in case sending the tasks failes # For now we just use the best node if best_node: return best_node return None
class DNACurveNode(ApplicationNode): """ DNA Curve Analysis application """ implements(IApp) factor = IntItem('dnasample', 'factor', 1, """How many workloads does a single task get assigned, in our a workload is considered a row""") def app_init(self): """ Called just before the main entry. Used as the initialization point instead of the ctor """ super(DNACurveNode, self).app_init() def app_main(self): """ Applications main entry """ return super(DNACurveNode, self).app_main() def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ self.start_time = time.time() self.dna_system = DNACurveTaskSystem("ATGCAAATTG"*1000, "trifonov", name="Example", maxlen=1024*1024, factor=self.factor) return self.dna_system def work_finished(self, result, task_system): """ Called when the work has been done, the results is what our ITaskSystem sent back to us. Check resukt for more info """ # Reassamble result to be processed further try: print("Total time: {}".format(time.time() - self.start_time)) except: traceback.print_exc() self.shutdown_main_loop() def push_tasksystem_response(self, result): """ We just added a ITaskSystem on the framwork. Check result for more info """ self.log.info("Tasks system send to computation framework") def push_tasksystem_failed(self, result): """ We failed to push a ITaskSystem on the computation framework! """ self.log.error("Tasks system failed to be send to framework!") # Check if the resuklt dict contains a traceback if "t" in result: self.log.error(result["t"])
class EnvSetup(Component): """ Component which will act as the setup manager for the environment. Also handles main backup/restore procedures if something hapens to the environment """ implements(IEnvUpgrader, IEnvBackup, IEnvDelete) # IEnvDelete methods def env_delete(self): """ Called when an env get's deleted, env is still valid """ self.log.info("(EnvSetup) Deleting Environment...") # IEnvBackup methods def env_backup(self): """ Called when we make a backup """ backup_data = {} self.log.info("(EnvSetup) Backup Environment...") return backup_data def env_restore(self): """ Called when we make a restore """ self.log.info("(EnvSetup) Restore Environment...") # IEnvUpgrader methods def env_created(self): """ Called when a new env has been created """ self.log.info("(EnvSetup) Created Environment...") def env_need_upgrade(self, dbManager): """ Called when we start an environment, if this call returns true the env will not able to load until we force an upgrade. TODO: This needs to be done! """ return False def env_do_upgrade(self, dbManager): """ This will perform the actual upgrade process. Be careful on using db transactions """ self.log.info("(EnvSetup) Uprade Environment...")
class DefaultApplication(Component): implements(IApp) def app_init(self): """ Initialize application just before running it """ self.log.info("Init Default Application..."); def app_main(self): """ Default Main implementation """ self.log.info("Starting Default Application..."); self.log.info("Closing Default Application..."); return APP_RET_CODE_SUCCESS
class MD5HashReverseNode(ApplicationNode): """ Reverse hash application """ implements(IApp) def app_init(self): """ Called just before the main entry. Used as the initialization point instead of the ctor """ ApplicationNode.app_init(self) def app_main(self): """ Applications main entry """ return ApplicationNode.app_main(self) def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ self.start_time = time.time() return MD5HashReverseTaskSystem(128) def work_finished(self, result, task_system): """ Called when the work has been done, the results is what our ITaskSystem sent back to us. Check resukt for more info """ print("Total time: {}".format(time.time() - self.start_time)) if not result[1]: if result[0]: self.log.info("Hash as been reversed. Initial number was %s" % str(result[0])) else: self.log.info("Failed to reverse the hash :(") else: self.log.error("Computation failed: %s" % str(result[1])) self.shutdown_main_loop() def push_tasksystem_response(self, result): """ We just added a ITaskSystem on the framwork. Check result for more info """ self.log.info("Tasks system send to computation framework")
class AdvancedTaskManager(Component): implements(ITaskManager) """ Advanced task manager which """ # # Our async task update logic. This gets executed from our step # controller from the nodes main loop. # # The update loop is composed by: # - Generate task list # - Execute task for current stat using current delta time and step count # - Collect results and syncronize between systems (we still need to define what # a system is, just a component that implements the system ExtensionPoint) # def update(self): """ Update call from the frameworks main loop. """ pass def _pre_execute(self): """ Called before a step is exeuted """ pass def _post_execute(self): """ Called after a step has been performed """ pass def _execute(self): """ Execute a given step of the framework execution """ pass
class ApplicationNode(Component, Node): implements(IApp) """ An application node is a consumer node of the framework """ master_url = HostItem('applicationnode', 'master', 'localhost:8080', """This slave master node""") def app_init(self): """ Initialize application just before running it """ super(ApplicationNode, self).app_init() # Null this one first self.master_node_tcp = None def app_main(self): """ Launch a concurrent application """ result = super(ApplicationNode, self).app_main() if result not in SUCCESS_RET_CODES: return result # Flag used to re-create the master handshake if an unexpected connection drop # was detected self.unexected_connection_error = False self.is_registered = False # Make sure we clear the system out self.task_system = None # Enter app main loop self.main_loop() # Stop all processes and threads self.stop_app_node() self.stop_api_thread() # Now launch base node return result def stop_app_node(self): try: self.unregister_from_master() self.master_node_tcp.close() except: traceback.print_exc() self.log.warn("Failed to close TCP compute channel with master!") # --------------------------------------------------------------------- # Master Node Registration # --------------------------------------------------------------------- def master_disconnected(self, gracefully): """ Called when a master is disconnected (gracefully) or we had no response from the master itself (ungracefull) """ self.log.info("Master disconnected (gracefully:%s)" % (gracefully)) return True def get_master_url(self): """ Get the URL where our master node is hosted """ return "%s:%d" % (self.master_url) def get_master_address(self): """ Get the adress and port in (host,port) fashion """ return ('localhost', 8081) def has_master(self): """ Check if the node has a master or not. Master node has no master itself """ return True def generate_client_api(self): """ Generate the client API of our compute channel """ if self.master_node_tcp: @tcpremote(self.master_node_tcp_client) def work_finished(handler, request, result, task_system): self.work_finished(result, task_system) raise NoResponseRequired() @tcpremote(self.master_node_tcp_client) def task_finished(handler, request, task, result, error): self.task_finished(task, result, error) raise NoResponseRequired() @tcpremote(self.master_node_tcp_client) def push_tasksystem_failed(handler, request, result): self.push_tasksystem_failed(result) @tcpremote(self.master_node_tcp_client) def push_tasksystem_response(handler, request, result): self.push_tasksystem_response(result) @tcpremote(self.master_node_tcp_client) def push_task_failed(handler, request, result): self.push_task_failed(result) @tcpremote(self.master_node_tcp_client) def push_task_response(handler, request, result): self.push_task_response(result) @tcpremote(self.master_node_tcp_client) def push_tasks_failed(handler, request, result): self.push_tasks_failed(result) @tcpremote(self.master_node_tcp_client) def push_tasks_response(handler, request, result): self.push_tasks_response(result) @tcpremote(self.master_node_tcp_client) def register_client_failed(handler, request, result): self.register_client_failed(result) @tcpremote(self.master_node_tcp_client) def register_client_response(handler, request, result): self.register_client_response(result) def work_finished(self, result, task_system): """ Called when the work has been done, the results is what our ITaskSystem sent back to us. 
Check resukt for more info """ raise NotImplementedError("Node has not implemented work_finished!") def task_finished(self, task, result, error): """ Called when a task has been done """ raise NotImplementedError("Node has not implemented task_finished!") def push_tasksystem_response(self, result): """ We just added a ITaskSystem on the framwork. Check result for more info """ raise NotImplementedError( "Node has not implemented push_tasksystem_response!") def push_tasksystem_failed(self, result): """ We failed to push a ITaskSystem on the computation framework! """ raise NotImplementedError( "Node has not implemented push_tasksystem_failed!") def register_client_failed(self, result): """ Called when we failed to register ouselfs to a master node. Raises an exception. """ raise FailedToRegisterWithMaster( "Client failed to register with the assigned master!") def push_task_response(self, result): """ We just add a Task to the computation framework """ raise NotImplementedError( "Node has not implemented push_task_response!") def push_task_failed(self, result): """ We failed to add a Task to the computation framework """ raise NotImplementedError("Node has not implemented push_task_failed!") def push_tasks_response(self, result): """ We just add a set of Tasks to the computation framework """ raise NotImplementedError( "Node has not implemented push_tasks_response!") def push_tasks_failed(self, result): """ We failed to add a set of Tasks to the computation framework """ raise NotImplementedError( "Node has not implemented push_tasks_failed!") def register_client_response(self, result): """ Called when we finsihed to register ouselfs to a master node. Raises an exception if the master rejected us. """ if not result: raise FailedToRegisterWithMaster( "Master rejected our registration attempt!") # Now that we are registered we can start sending the application to the master and start processing it, # if this is a re-register we hope that the computation has not yet been completed! 
if self.task_system is None: self._start_processing() def register_with_master(self): """ The node will register itself with the expected master node """ try: # Try to register with the master result = self.master_node.register_client(self.node_id_str, self.port, {}) self.is_registered = True # if the node ID is we are getting from the master is different we are re-registering if result['id'] == self.node_id_str: self.node_id = uuid.UUID(result['id']) self.node_id_str = str(self.node_id) # Now we try to connect through our compute channel self.master_node_tcp, self.master_node_tcp_client = self.create_tcp_proxy( self.master_url[0], result['port']) self.generate_client_api() # Now connect self.master_node_tcp.connect() # No register us with the compute channel befor the master makes a timeout self.master_node_tcp.register_client(self.node_id_str) except: pass def unregister_from_master(self): """ The node will unregister itself with the expected master node """ if self.node_id: try: self.master_node.unregister_client(self.node_id_str) except Exception as e: self.log.error("Exception when unregistering from master: %s" % str(e)) self.node_id = None def send_heartbeat(self): """ Send heartbeat to master in case we have one """ self.conditional_register_with_master() if self.node_id: try: self.is_registered = self.master_node.heartbeat_client( self.node_id_str) except: pass def conditional_register_with_master(self): """ Try to register with master after an unexpected connection failure """ if not self.is_registered: try: self.register_with_master() except: pass def rpc_call_failed(self, proxy, method, reason): """ Called when an RPC call failed for an unexpected reason """ self.log.debug("Method %s failed because of %s" % (method, reason)) # Handle network connection failures if urllib2.URLError == reason: self.unexected_connection_error = True self.is_registered = False def rpc_call_success(self, proxy, method, result): """ Called when an RPC call succeded """ self.log.debug("Method %s succeded with %s" % (method, result)) self.unexected_connection_error = False return result # --------------------------------------------------------------------- # ITaskSystem Handling # --------------------------------------------------------------------- def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ raise NotImplementedError("Node has not implemented get_task_system!") def start_processing(self): """ Called when the app is not using a ITaskSystem and will instead just add tasks and will take care of the task flow itself """ raise NotImplementedError("Node has not implemented start_processing!") def _start_processing(self): """ Called once the application is registered with the framework and we are ok to start our processing! """ # Request task system instance self.task_system = self.get_task_system() if self.task_system: # Make sure its an instance of ITaskSystem if not isinstance(self.task_system, ITaskSystem): raise NotImplementedError( 'TaskSystem "%s" not an instance of ITaskSystem' % str(self.task_system)) # Pickle and send! self.master_node_tcp.push_tasksystem(self.task_system) else: self.start_processing() def push_task(self, task): """ Send a task to the computation framework """ self.master_node_tcp.push_task(task) def push_tasks(self, tasks): """ Send a set of tasks to the computation framework """ self.master_node_tcp.push_tasks(tasks)
class ExpensiveNode(ApplicationNode): """ Application node distributing the computation of an expensive task """ implements(IApp) time_per_task = IntItem( 'expensivesample', 'time_per_task', 1, """Time each task will perform on doing nothind (active wait) to simulate an expensive computation""" ) num_tasks = IntItem('expensivesample', 'num_tasks', 8, """Number of tasks that must be performend""") def app_init(self): """ Called just before the main entry. Used as the initialization point instead of the ctor """ super(ExpensiveNode, self).app_init() def app_main(self): """ Applications main entry """ return super(ExpensiveNode, self).app_main() def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ self.start_time = time.time() self.system = ExpensiveNodeTaskSystem(self.time_per_task, self.num_tasks) return self.system def work_finished(self, result, task_system): """ Called when the work has been done, the results is what our ITaskSystem sent back to us. Check resukt for more info """ end_time = time.time() - self.start_time self.log.info("Total time: {}".format(end_time)) # Print expected single threaded time and improvement expected_time = self.time_per_task * self.num_tasks self.log.info("Plain python expected time: {}".format(expected_time)) self.log.info("Concurrent improvememnet: {}%".format( (expected_time / end_time) * 100.0)) self.shutdown_main_loop() def push_tasksystem_response(self, result): """ We just added a ITaskSystem on the framwork. Check result for more info """ self.log.info("Tasks system send to computation framework") def push_tasksystem_failed(self, result): """ We failed to push a ITaskSystem on the computation framework! """ self.log.error("Tasks system failed to be send to framework!") # Check if the resuklt dict contains a traceback if "t" in result: self.log.error(result["t"])
class GenericTaskScheduler(Component): implements(ITaskScheduler) """ Interface used by our distributed task scheduler. A scheduler receives an implemented system that will be executed on the distributed system through pickleing Python instances. """ strategy = ExtensionPointItem('generictaskscheduler', 'strategy', ITaskScheduleStrategy, 'GenericTaskScheduleStrategy', """Task schedulers used to schedule execution""") def __init__(self): Component.__init__(self) self.stats = Stats.getInstance() # Map that maps tasks and slaves to be able to resend the tasks if the slave was deleted from the system self.task_map = {} def setup(self, master): self.master = master self.lock = self.master.registry_lock # This is the global systems task queue. Every time we add tasks we will add them to this queue. # The global queue is where the current strategy will pickup tasks and decide which ones shall # be sent over a slave to be processed (this is getting done from a thread that waits for the queue) # If a new tasks gets added or a task gets completed we will notify the strategy which then decides to # pickup and process a new task. self.tasks = multiprocessing.JoinableQueue() # Schedule thread which will pickup processabel task and send them to a good slave self.schedule_thread = schedule_thread(self.log, self.tasks, self.handle_task) self.schedule_thread.start() # Do not pass the lock to the strategy, we have to ensure we handle locks for it self.strategy.setup(self, self.master) def stop(self): self.schedule_thread.stop() def _valid_id_no_lock(self, slave_id): """ Check if slave id is pointing to a valid slave without any lock """ return slave_id in self.master.node_registry and self._valid_slave_no_lock(self.master.node_registry[slave_id]) def _valid_slave_no_lock(self, slave): """ Check if a slave is valid without using any locks """ return slave and slave.type == NodeType.slave and slave.state == NodeState.active def rate_slaves(self): """ Update slaves """ with self.lock.writelock: start = time.time() for slave_id in self.master.node_registry: if self._valid_slave_no_lock(self.master.node_registry[slave_id]): self.master.node_registry[slave_id].rating = self.strategy.rate(self.master.node_registry[slave_id]) ellapsed = time.time() - start self.stats.add_avg("GenericTaskScheduleStrategy-rate-time",ellapsed) def start_system(self, task_system): """ Start an incomming task system """ self.push_tasks(task_system.generate_tasks(self.master)) def push_tasks(self, tasks): """ Push all tasks on the global task queue """ for task in tasks: self.push_task(task) def push_task(self, task): """ Put a task on the global task queue """ # Do not poison ourselfs! if task: self.tasks.put(task) def handle_task(self, task): """ Send a task to a slave or in case it failed queue the task back """ with self.lock.readlock: reschedule = True try: slave_id = self.strategy.get_next_slave() if slave_id: #TODO: Pickle task and send to slave task.slave_id = slave_id start = time.time() self.master.node_registry[task.slave_id].tcp_proxy.push_task(task) #print("Sending task: {} in {}".format(task.name, time.time() - start)) reschedule = False # Add task id to this slave so we could resend the task self._tasked_pushed(task.slave_id) except Exception as e: #self.log.error("Failed to send task to slave: %s. Queueing task again!" % str(e)) self.stats.add_avg("GenericTaskScheduler-task-send-failed") # Make sure we try it again! 
if reschedule: self.push_task(task) def _tasked_pushed(self, slave_id): """ A slave has aquired a new task, update its rank """ #with self.lock.readlock: if self._valid_id_no_lock(slave_id): self.master.node_registry[slave_id].tasks += 1 self.master.node_registry[slave_id].rating = self.strategy.rate(self.master.node_registry[slave_id]) #print("Push: {}".format(self.master.node_registry[slave_id].tasks)) def task_finished(self, task, result, error): """ A slave has finished a new task, update its rank """ task.finished(result, error) # Do not aquiere any write lock if the id is not valid! #with self.lock.readlock: if self._valid_id_no_lock(task.slave_id): self.master.node_registry[task.slave_id].tasks -= 1 self.master.node_registry[task.slave_id].rating = self.strategy.rate(self.master.node_registry[task.slave_id])
class GenericTaskManager(Component): implements(ITaskManager) """ Simple task manager used in simple single job applications """ num_workers = IntItem('GenericTaskManager', 'num_workers', -1, """Number of worker processed to be created, -1 will spawn as much as physical cores.""") def __init__(self, *args, **kwargs): Component.__init__(self, *args, **kwargs) # Initialize base manager stuff self._num_workers = 0 self.results = multiprocessing.JoinableQueue() def init(self, identity, address): """ Initialize the manager """ self.identity = identity self.host = address[0] self.port = address[1] self._num_workers = self.num_workers if self._num_workers <= 0: self._num_workers = multiprocessing.cpu_count() # We now prepare our queues, both the joinable and the results # queues. Then we just create a process for each worker self.tasks = multiprocessing.JoinableQueue() self.processes = [TaskProcess(self.results, i, self.tasks, self.identity, self.host, self.port) for i in range(self._num_workers)] #self.processes = [TaskProcess(self.results, i) for i in range(self._num_workers)] context = zmq.Context() self.ventilator_send = context.socket(zmq.PUSH) self.ventilator_send.bind("tcp://127.0.0.1:%d" % WORKER_PORT) def get_num_workers(self): """ Return the number of workers we use for our processing """ return self._num_workers def start(self): """ Start our worker processes """ for worker in self.processes: worker.daemon = True worker.start() def stop(self): """ Stop our worker processes """ for i in xrange(self._num_workers): #send_to_zmq_zipped(self.ventilator_send, None) print("Adding task") self.tasks.put(None) # Poison for result listener self.results.put(None) def update_pool(self, _num_workers=-1): """ Set the number of workers the task manager should use """ self.stop() self.init(_num_workers) self.start() def push_task(self, task): """ Push a task that should be completed by the workers """ try: #send_to_zmq_zipped(self.ventilator_send, task) self.tasks.put(task) except: traceback.print_exc() return True def wait_for_all(self): """ Wait until all tasks has been finished """ pass def get_results_queue(self): """ Return a refernce to the result queue """ return self.results def task_finished(self, task, result, error): """ Called once a task has been performed """ task.finished(result, error)
class MandlebrotSimpleNode(ApplicationNode): """ Application node distributing the computation of the mandlebrot set using just tasks """ implements(IApp) use_optimized_task = BoolItem( 'mandlebrotsample', 'use_optimized_task', True, """Should we use the data optimized task or the lazy task""") send_task_batch = BoolItem( 'mandlebrotsample', 'task_batch', True, """Should we send all tasks one by one or should we batch them into a hughe list""" ) factor = IntItem( 'mandlebrotsample', 'factor', 1, """How many workloads does a single task get assigned, in our a workload is considered a row""" ) iters = IntItem('mandlebrotsample', 'iters', 20, """Mandlebrot iterations per pixel""") height = IntItem('mandlebrotsample', 'height', 1024, """Height of the mandlebrot set image""") width = IntItem('mandlebrotsample', 'width', 1536, """Width of the mandlebrot set image""") def app_init(self): """ Called just before the main entry. Used as the initialization point instead of the ctor """ super(MandlebrotSimpleNode, self).app_init() def app_main(self): """ Applications main entry """ return super(MandlebrotSimpleNode, self).app_main() def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ # Do not create a tasks system, we will handle tasks on our own return None def start_processing(self): """ Called when the app is not using a ITaskSystem and will instead just add tasks and will take care of the task flow itself """ self.log.info("Starting computation") if self.send_task_batch: self.log.info(" Task batching enabled") self.start_time = time.time() self.image = np.zeros((self.height, self.width), dtype=np.uint8) # Init task related stuff self.min_x = -2.0 self.max_x = 1.0 self.min_y = -1.0 self.max_y = 1.0 self.pixel_size_x = (self.max_x - self.min_x) / self.width self.pixel_size_y = (self.max_y - self.min_y) / self.height # Job handling (very optimistic :D) self.jobs = 0 self.finished_jobs = 0 job_list = [] workload = [] rows = 0 x = 0 if self.use_optimized_task: num_tasks, reminder = divmod(self.width, self.factor) self.jobs = num_tasks + reminder for i in xrange(0, self.jobs): if self.send_task_batch: job_list.append( MandlebrotTaskOptimized("m", None, self.node_id_str, iters=self.iters, start_x=i, rows=self.factor, cols=self.height, pixel_size_x=self.pixel_size_x, pixel_size_y=self.pixel_size_y, min_x=self.min_x, min_y=self.min_y)) else: self.push_task( MandlebrotTaskOptimized("m", None, self.node_id_str, iters=self.iters, start_x=i, rows=self.factor, cols=self.height, pixel_size_x=self.pixel_size_x, pixel_size_y=self.pixel_size_y, min_x=self.min_x, min_y=self.min_y)) else: for x in range(self.width): # Distribute using rows rows += 1 real = self.min_x + x * self.pixel_size_x for y in range(self.height): imag = self.min_y + y * self.pixel_size_y workload.append((x, y, real, imag, self.iters)) # every self.factor rows create a task with the workload. 
Note that in this case we will force the system_id to be None while setting the client id if rows == self.factor: if self.send_task_batch: job_list.append( MandlebrotTask("mandle_{}".format(x), None, self.node_id_str, iters=self.iters, workload=workload)) else: self.push_task( MandlebrotTask("mandle_{}".format(x), None, self.node_id_str, iters=self.iters, workload=workload)) self.jobs += 1 workload = [] rows = 0 # Add last task with rest of workload if len(workload) > 0: if self.send_task_batch: job_list.append( MandlebrotTask("mandle_{}".format(x), None, self.node_id_str, iters=self.iters, workload=workload)) else: self.push_task( MandlebrotTask("mandle_{}".format(x), None, self.node_id_str, iters=self.iters, workload=workload)) self.jobs += 1 if self.send_task_batch: self.jobs = len(job_list) # Send batch or check for eventual end condition if self.send_task_batch: self.push_tasks(job_list) else: # Check in case we are already done! self.check_finished() def task_finished(self, task, result, error): """ Called when a task has been done """ # Integrate results in our image if result: for x, column in result.iteritems(): for y, value in column.iteritems(): self.image[y, x] = value self.finished_jobs += 1 self.check_finished() def check_finished(self): """ Check if we finsihed all computation or not """ if self.finished_jobs == self.jobs: self.log.info("All tasks finished!!") print("Calculated in {} seconds!".format(time.time() - self.start_time)) self.shutdown_main_loop() imshow(self.image) show() def push_task_response(self, result): """ We just add a Task to the computation framework """ pass #self.log.info("Task send to computation framework") def push_task_failed(self, result): """ We failed to add a Task to the computation framework """ self.log.info("Failed to send task send to computation framework") def push_tasks_response(self, result): """ We just add a set of Tasks to the computation framework """ self.log.info("Tasks send to computation framework") def push_tasks_failed(self, result): """ We failed to add a set of Tasks to the computation framework """ self.log.info("Failed to send tasks send to computation framework")
class MandlebrotNode(ApplicationNode): """ Application node distributing the computation of the mandlebrot set using an autonomous task system """ implements(IApp) use_optimized_task = BoolItem( 'mandlebrotsample', 'use_optimized_task', True, """Should we use the data optimized task or the lazy task""") factor = IntItem( 'mandlebrotsample', 'factor', 1, """How many workloads does a single task get assigned, in our a workload is considered a row""" ) iters = IntItem('mandlebrotsample', 'iters', 20, """Mandlebrot iterations per pixel""") height = IntItem('mandlebrotsample', 'height', 1024, """Height of the mandlebrot set image""") width = IntItem('mandlebrotsample', 'width', 1536, """Width of the mandlebrot set image""") def app_init(self): """ Called just before the main entry. Used as the initialization point instead of the ctor """ super(MandlebrotNode, self).app_init() def app_main(self): """ Applications main entry """ return super(MandlebrotNode, self).app_main() def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ self.start_time = time.time() self.system = MandlebrotTaskSystem(-2.0, 1.0, -1.0, 1.0, self.height, self.width, self.iters, self.factor, self.use_optimized_task) return self.system def work_finished(self, result, task_system): """ Called when the work has been done, the results is what our ITaskSystem sent back to us. Check resukt for more info """ print("Total time: {}".format(time.time() - self.start_time)) self.shutdown_main_loop() # Reassamble result to be processed further try: self.system.image = np.zeros((self.height, self.width), dtype=np.uint8) self.system.do_post_run(result) except: traceback.print_exc() def push_tasksystem_response(self, result): """ We just added a ITaskSystem on the framwork. Check result for more info """ self.log.info("Tasks system send to computation framework") def push_tasksystem_failed(self, result): """ We failed to push a ITaskSystem on the computation framework! """ self.log.error("Tasks system failed to be send to framework!") # Check if the resuklt dict contains a traceback if "t" in result: self.log.error(result["t"])
class ExpensiveSimpleNode(ApplicationNode): """ Application node distributing the computation of the mandlebrot set using just tasks """ implements(IApp) send_task_batch = BoolItem( 'expensivesample', 'task_batch', True, """Should we send all tasks one by one or should we batch them into a hughe list""" ) time_per_task = IntItem( 'expensivesample', 'time_per_task', 1, """Time each task will perform on doing nothind (active wait) to simulate an expensive computation""" ) num_tasks = IntItem('expensivesample', 'num_tasks', 8, """Number of tasks that must be performend""") def app_init(self): """ Called just before the main entry. Used as the initialization point instead of the ctor """ super(ExpensiveSimpleNode, self).app_init() def app_main(self): """ Applications main entry """ return super(ExpensiveSimpleNode, self).app_main() def get_task_system(self): """ Called from the base class when we are connected to a MasterNode and we are able to send computation tasks over """ # Do not create a tasks system, we will handle tasks on our own return None def start_processing(self): """ Called when the app is not using a ITaskSystem and will instead just add tasks and will take care of the task flow itself """ self.log.info("Starting computation") if self.send_task_batch: self.log.info(" Task batching enabled") self.start_time = time.time() self.finished_jobs = 0 if self.send_task_batch: self.push_tasks([ ExpensiveTask("expensive_{}".format(i), None, self.node_id_str, sleep_time=self.time_per_task) for i in range(self.num_tasks) ]) else: for i in range(self.num_tasks): self.push_task( ExpensiveTask("expensive_{}".format(i), None, self.node_id_str, sleep_time=self.time_per_task)) self.check_finished() def task_finished(self, task, result, error): """ Called when a task has been done """ self.finished_jobs += 1 self.check_finished() def check_finished(self): """ Check if we finsihed all computation or not """ self.log.info("%d -> %d" % (self.finished_jobs, self.num_tasks)) if self.finished_jobs == self.num_tasks: self.log.info("All tasks finished!!") end_time = time.time() - self.start_time self.log.info("Total time: {}".format(end_time)) # Print expected single threaded time and improvement expected_time = self.time_per_task * self.num_tasks self.log.info( "Plain python expected time: {}".format(expected_time)) self.log.info("Concurrent improvememnet: {}%".format( (expected_time / end_time) * 100.0)) self.shutdown_main_loop() def push_task_response(self, result): """ We just add a Task to the computation framework """ pass #self.log.info("Task send to computation framework") def push_task_failed(self, result): """ We failed to add a Task to the computation framework """ self.log.info("Failed to send task send to computation framework") def push_tasks_response(self, result): """ We just add a set of Tasks to the computation framework """ self.log.info("Tasks send to computation framework") def push_tasks_failed(self, result): """ We failed to add a set of Tasks to the computation framework """ self.log.info("Failed to send tasks send to computation framework")
class SlaveNode(Component, ComputeNode): implements(IApp) """ A slave is a remote worker node that receives jobs from a master works them out and then returns the result to the master. """ master_url = HostItem('slavenode', 'master', 'localhost:8080', """This slave master node""") def app_init(self): """ Initialize application just before running it """ super(SlaveNode, self).app_init() # Null this one first self.master_node_tcp = None def app_main(self): """ Launch a concurrent application """ result = super(SlaveNode, self).app_main() if result not in SUCCESS_RET_CODES: return result # Start computation try: self.setup_compute_node() except Exception: self.stop_compute_node() return APP_RET_CODE_FAILED # Flag used to re-create the master handshake if an unexpected connection drop # was detected self.unexected_connection_error = False self.is_registered = False # Enter app main loop self.main_loop() # Stop all processes and threads self.stop_compute_node() self.stop_api_thread() # Now launch base node return result def stop_compute_node(self): ComputeNode.stop_compute_node(self) try: self.unregister_from_master() self.master_node_tcp.close() except: traceback.print_exc() self.log.warn("Failed to close TCP compute channel with master!") # --------------------------------------------------------------------- # Master Node Registration # --------------------------------------------------------------------- def master_disconnected(self, gracefully): """ Called when a master is disconnected (gracefully) or we had no response from the master itself (ungracefull) """ self.log.info("Master disconnected (gracefully:%s)" % (gracefully)) return True def get_master_url(self): """ Get the URL where our master node is hosted """ return "%s:%d" % (self.master_url) def get_master_address(self): """ Get the adress and port in (host,port) fashion """ return ('localhost',8081) def has_master(self): """ Check if the node has a master or not. Master node has no master itself """ return True def generate_client_api(self): """ Generate the client API of our compute channel """ if self.master_node_tcp: @tcpremote(self.master_node_tcp_client) def push_task(handler, request, task): self.stats.add_avg('push_task') return self.push_task(task) @tcpremote(self.master_node_tcp_client) def register_slave_failed(handler, request, result): self.register_slave_failed(result) @tcpremote(self.master_node_tcp_client) def register_slave_response(handler, request, result): self.register_slave_response(result) def register_slave_failed(self, result): """ Called when we failed to register ouselfs to a master node. Raises an exception. """ raise FailedToRegisterWithMaster("Slave failed to register with the assigned master!") def register_slave_response(self, result): """ Called when we finsihed to register ouselfs to a master node. Raises an exception if the master rejected us. """ if not result: raise FailedToRegisterWithMaster("Master rejected our registration attempt!") def register_with_master(self): """ The node will register itself with the expected master node """ try: # Try to register with the master # TODO: Send all data the master requires to use the node best: # - Processor information: Amount, type, cache, ... # - RAM: Amount, speed, type, ECC? # - GPU: Type of cards, OpenCL, Cuda, amount, speed, memory, ... # - Net: Interface speed, connection speed, roundtrip, ... # - OS: Os type, previledges, ... 
result = self.master_node.register_slave(self.node_id_str, self.port, {'workers':self.get_num_workers()}) self.is_registered = True # if the node ID is we are getting from the master is different we are re-registering if result['id'] == self.node_id_str: self.node_id = uuid.UUID(result['id']) self.node_id_str = str(self.node_id) # Now we try to connect through our compute channel self.master_node_tcp, self.master_node_tcp_client = self.create_tcp_proxy(self.master_url[0], result['port']) self.generate_client_api() # Now connect self.master_node_tcp.connect() # No register us with the compute channel befor the master makes a timeout self.master_node_tcp.register_slave(self.node_id_str) except: traceback.print_exc() def unregister_from_master(self): """ The node will unregister itself with the expected master node """ if self.node_id: try: self.master_node.unregister_slave(self.node_id_str) self.master_node_tcp.close() except Exception as e: traceback.print_exc() self.log.error("Exception when unregistering from master: %s" % str(e)) self.node_id = None def send_heartbeat(self): """ Send heartbeat to master in case we have one """ self.conditional_register_with_master() if self.node_id: try: self.is_registered = self.master_node.heartbeat_slave(self.node_id_str) except: pass def conditional_register_with_master(self): """ Try to register with master after an unexpected connection failure """ if not self.is_registered: try: self.register_with_master() except: pass def rpc_call_failed(self, proxy, method, reason): """ Called when an RPC call failed for an unexpected reason """ self.log.debug("Method %s failed because of %s" % (method, reason)) # Handle network connection failures if urllib2.URLError == reason: self.unexected_connection_error = True self.is_registered = False def rpc_call_success(self, proxy, method, result): """ Called when an RPC call succeded """ self.log.debug("Method %s succeded with %s" % (method, result)) self.unexected_connection_error = False return result # --------------------------------------------------------------------- # Task Handling # --------------------------------------------------------------------- def task_finished(self, task, result, error): """ Called when a task has finished its computation, the result object contains the task, the result or an error and additional information """ try: self.master_node_tcp.task_finished(task, result, error) return True except: traceback.print_exc() return False
class ZMQTaskManager(Component, threading.Thread): implements(ITaskManager) """ Simple task manager used in simple single job applications """ num_workers = IntItem('ZMQTaskManager', 'num_workers', -1, """Number of worker processed to be created, -1 will spawn as much as physical cores.""") master_backend_port = HostItem('ZMQTaskManager', 'master_backend_port', 'localhost:5001', """Masters backend port where we will request tasks.""") def __init__(self): threading.Thread.__init__(self) Component.__init__(self) # Some thread related stuff self.daemon = True self.kill_switch = False # Create contect and socket self.context = zmq.Context() # Initialize base manager stuff self._num_workers = 0 self.results = multiprocessing.JoinableQueue() def init(self, identity, address): """ Initialize the manager """ self.identity = identity self.host = address[0] self.port = address[1] self._num_workers = self.num_workers if self._num_workers <= 0: self._num_workers = multiprocessing.cpu_count() # We now prepare our queues, both the joinable and the results # queues. Then we just create a process for each worker self.tasks = multiprocessing.JoinableQueue() self.processes = [TaskProcess(self.results, i, self.tasks, self.identity, self.host, self.port) for i in range(self._num_workers)] #self.processes = [TaskProcess(self.results, i) for i in range(self._num_workers)] context = zmq.Context() self.ventilator_send = context.socket(zmq.PUSH) self.ventilator_send.bind("tcp://127.0.0.1:%d" % WORKER_PORT) def get_num_workers(self): """ Return the number of workers we use for our processing """ return self._num_workers def start(self): """ Start our worker processes """ threading.Thread.start(self) for worker in self.processes: worker.daemon = True worker.start() def stop(self): """ Stop our worker processes """ self.log.info("Shutting down ZMQTaskManager") for i in xrange(self._num_workers): #send_to_zmq_zipped(self.ventilator_send, None) self.tasks.put(None) # Poison for result listener self.results.put(None) # Kill our own thread self.kill_switch = True self.context.term() self.join(5000) self.log.info("ZMQTaskManager shutdown finished") def run(self): self.log.info("ZMQTaskManager started") # Create and connect to our scheduler socket self.socket = self.context.socket(zmq.PULL) self.socket.setsockopt(zmq.LINGER, 0) self.socket.set_hwm(0) self.socket.connect('tcp://{host}:{port}'.format(host=self.master_backend_port[0], port=self.master_backend_port[1])) # Start receiving messages while not self.kill_switch: try: next_task = receive_from_zmq_zipped(self.socket) self.push_task(next_task) except zmq.ContextTerminated: break except zmq.ZMQError as e: if e.errno == zmq.EAGAIN: pass # no message was ready else: break except: traceback.print_exc() self.socket.close() self.log.info("ZMQTaskManager stopped") def update_pool(self, _num_workers=-1): """ Set the number of workers the task manager should use """ self.stop() self.init(_num_workers) self.start() def push_task(self, task): """ Push a task that should be completed by the workers """ try: #send_to_zmq_zipped(self.ventilator_send, task) self.tasks.put(task) except: traceback.print_exc() return True def wait_for_all(self): """ Wait until all tasks has been finished """ pass def get_results_queue(self): """ Return a refernce to the result queue """ return self.results def task_finished(self, task, result, error): """ Called once a task has been performed """ task.finished(result, error)
class ZMQTaskScheduler(Component, threading.Thread): implements(ITaskScheduler) """ Different task scheduler implementation using ZMQ push/pull sockets. Uses a simple round-robin mechanism to handle multiple slaves. """ frontend_port = IntItem('ZMQTaskScheduler', 'frontend_port', 5000, """Frontend port used to send tasks to the scheduler""") backend_port = IntItem('ZMQTaskScheduler', 'backend_port', 5001, """Backend port used to send tasks to the scheduler. Slaves will receive tasks on it.""") def __init__(self): threading.Thread.__init__ (self) Component.__init__(self) self.stats = Stats.getInstance() # Some thread related stuff self.daemon = True self.kill_switch = False # The socket framework self.context = zmq.Context() self.frontend = self.context.socket(zmq.PULL) self.frontend.bind('tcp://*:{port}'.format(port=self.frontend_port)) self.frontend.setsockopt(zmq.LINGER, 0) self.frontend.set_hwm(0) self.backend = self.context.socket(zmq.PUSH) self.backend.bind('tcp://*:{port}'.format(port=self.backend_port)) self.backend.setsockopt(zmq.LINGER, 0) self.backend.set_hwm(0) # The poller is used to poll for incomming messages for both # the frontend (internet) and the backend (scheduling) self.poll = zmq.Poller() self.poll.register(self.frontend, zmq.POLLIN) # Connected socket locally to frontend to send tasks, this socket # provides a lock to be able to be thread-safe self.frontend_push = self.context.socket(zmq.PUSH) self.frontend_push.connect('tcp://localhost:{port}'.format(port=self.frontend_port)) self.frontend_push.setsockopt(zmq.LINGER, 0) self.frontend_push.set_hwm(0) # Our lock used to protect the frontend_push socket self.lock = threading.Lock() def setup(self, master): self.master = master self.start() def run(self): self.log.info("ZMQTaskScheduler started") # Start receiving messages while not self.kill_switch: try: sockets = dict(self.poll.poll(1000)) if self.frontend in sockets: msg = self.frontend.recv(flags=zmq.NOBLOCK) #tprint('Server received message from %s' % (ident)) self.backend.send(msg, flags=zmq.NOBLOCK) except zmq.Again: # Timeouy just fired, no problem! pass except KeyboardInterrupt: break except zmq.ContextTerminated: break except zmq.ZMQError as e: if e.errno == zmq.EAGAIN: pass # no message was ready else: break except: traceback.print_exc() # Not really good to just pass but saver for now! pass self.frontend.close() self.backend.close() with self.lock: self.frontend_push.close() self.context.term() self.log.info("ZMQTaskScheduler stopped") def stop(self): self.log.info("Shutting down ZMQTaskScheduler") self.kill_switch = True self.join(5000) self.log.info("ZMQTaskScheduler shutdown finished") def start_system(self, task_system): """ Start an incomming task system """ self.push_tasks(task_system.generate_tasks(self.master)) def _push_task(self, task): """ No lock variant of push task method """ send_to_zmq_zipped(self.frontend_push, task) def push_tasks(self, tasks): """ Push all tasks on the global task queue """ with self.lock: # DO NOT USE push_task to queue tasks! It would be a deadlock! for task in tasks: self._push_task(task) #self.tasks.put(task) def push_task(self, task): """ Put a task on the global task queue """ with self.lock: # Do not poison ourselfs! 
if task: self._push_task(task) #self.tasks.put(task) def rate_slaves(self): """ Update slaves """ pass def _tasked_pushed(self, slave_id): """ A slave has aquired a new task, update its rank """ pass def task_finished(self, task, result, error): """ A slave has finished a new task, update its rank """ task.finished(result, error)
class Pickler(Component): implements(IPickler) """ Class responsible for pickling and unpickling objects """ pickle_protocol = IntItem( 'pickler', 'protocol', pickle.HIGHEST_PROTOCOL, """Protocol used when pickling, by default pickle.HIGHEST_PROTOCOL""") secret = ConfigItem( 'pickler', 'secret', 'JhTv535Vg385V', """Default salt used on decrypting encrypting a pickle""") # salt size in bytes salt_size = IntItem('pickler', 'salt_size', 16, """Size of the salt used in the encryption process""") # number of iterations in the key generation num_iterations = IntItem( 'pickler', 'num_iterations', 20, """Number of iterations used in the key generation""") # the size multiple required for AES aes_padding = IntItem('pickler', 'aes_padding', 16, """Padding used for AES encryption""") def __init__(self): super(Pickler, self).__init__() self.crypto_helper = CryptoHelper(self.salt_size, self.num_iterations, self.aes_padding) if self.secret == Pickler.secret.default.decode('utf-8'): self.log.warn( "Pickler using default secret, please setup you own to avoid security vulnerabilities!" ) def pickle_f(self, fname, obj): """ picke an object into a file """ try: pickle.dump(obj=obj, file=gzip.open(fname, "wb"), protocol=self.pickle_protocol) except: raise PickleException() def unpickle_f(self, fname): """ Unpicke an object from a file """ try: return pickle.load(gzip.open(fname, "rb")) except: raise UnpickleException() def pickle_s(self, obj): """ pickle an object and return the pickled string """ try: return pickle.dumps(obj, protocol=self.pickle_protocol) except: raise PickleException() def pickle_encode_s(self, obj): """ Encode a pickled object """ try: return base64.b64encode( self.crypto_helper.encrypt(self.pickle_s(obj), self.secret)) except: raise PickleException() def unpickle_s(self, pickle_string): """ unpickle a string and return an object """ try: return pickle.loads(pickle_string) except: raise UnpickleException() def unpickle_decode_s(self, pickle_string): """ Unpickle a base64 string and return an object """ try: return self.unpickle_s( self.crypto_helper.decrypt(base64.b64decode(pickle_string), self.secret)) except: raise UnpickleException()
class MasterNode(Component, BaseNode): implements(IApp) """ A MasterNode is a compute node that can act and be used in computation when in standalone mode but is mainly used to dsitribute jobs along registered slaves. Once the jobs of a slave, or its own, are finished we will redistribute the results to the responsible client nodes. """ is_standalone = BoolItem('masternode', 'is_standalone', 'False', """Master node is also a slave and a standalone application""") inactivity_time_multiplier = IntItem('node', 'inactivity_time_multiplier', 3, """Inactivty multiplier multiplies the heartbeat time to ensure inactivity is always several heartbeats""") registry_mirror_timer = FloatItem('masternode', 'registry_mirror_timer', 30.0, """Timer used to update node registry mirror""") registry_cleanup_timer = FloatItem('masternode', 'registry_cleanup_timer', 60.0, """Timer used to cleanup the node registry""") task_scheduler= ExtensionPointItem('masternode', 'task_scheduler', ITaskScheduler, 'GenericTaskScheduler', """Task scheduler used by the master node""") master_port = IntItem('node', 'master_port', 8081, """Port used by the master node for high-performance communication and dedicated persistent connections""") def app_init(self): """ Initialize application just before running it """ super(MasterNode, self).app_init() # Start our TCPServer, #self.server = TCPServer("localhost", self.master_port, self) #self.server_thread = threading.Thread(name="tcp_server", target=self.server.serve_forever) #self.server_thread.daemon = True # Setup our ZeroMQ asyn server self.zmq_server = TCPServerZMQ(self.master_port, self.log, 5) # The node registry holds updated into about slaves/clients and its processing # we week track of number of tasks submitted to each slave, how they perform # general statistics and more. self.node_registry = defaultdict(self._default_node) self.registry_lock = self.lock_cache.registry_lock self.node_cleanup_threshold = self.registry_cleanup_timer self.task_scheduler.setup(self) # Our client registry self.client_registry = defaultdict(self._default_node) self.client_registry_lock = self.lock_cache.client_registry_lock # The registry mirror is used to send all updates from time to time and cache it. 
# We use a different dict so client status request do not block self.node_registry_mirror = {} self.registry_mirror_lock = self.lock_cache.registry_mirror_lock self.registry_mirror_threshold = self.registry_mirror_timer self.registry_mirror_dirty = True # Client registry mirror self.client_registry_mirror = {} self.client_registry_mirror_lock = self.lock_cache.client_registry_mirror_lock # Timer which controls inactivity handling of a node, being it a slave or a client self.inactivity_timer = self.heartbeat_timer*self.inactivity_time_multiplier self.inactivity_unregister_timer = self.inactivity_timer * 3 self.inactivity_threshold = self.inactivity_timer self.test_timer = 1 self.test_app_id = uuid.uuid1() # Our task system registry self.tasksystem_registry = defaultdict(self._default_tasksystem) self.tasksystem_lock = self.lock_cache.tasksystem_lock # Create master thread #self.master_thread = master_thread(self.log) def app_main(self): """ Launch a concurrent application """ self.log.info("Initializing MasterNode") result = super(MasterNode, self).app_main() if result not in SUCCESS_RET_CODES: return result # Start the main server thread #self.server_thread.start() self.zmq_server.start() # Enter mail loop self.main_loop() # Stop all threads processes #self.server.shutdown() self.zmq_server.stop() self.notify_shutdown() self.stop_api_thread() #self.stop_master_thread() self.task_scheduler.stop() # Now launch base node return result def handle_echo(self, sock, address): print(address) fp = sock.makefile() while True: line = fp.readline() if line: fp.write(line) fp.flush() else: break def stop_master_thread(self): self.master_thread.stop() def generate_api(self): """ Create all rpc methods the node requires """ super(MasterNode, self).generate_api() if not self.is_standalone: @jsonremote(self.api_service_v1) def register_slave(request, node_id, port, data): self.stats.add_avg('register_slave') return self.register_node(node_id, web.ctx['ip'], port, data, NodeType.slave) @tcpremote(self.zmq_server, name='register_slave') #@tcpremote(self.server, name='register_slave') def register_slave_tcp(handler, request, node_id): self.stats.add_avg('register_slave_tcp') return self.register_node_tcp(handler, request, node_id, NodeType.slave) @jsonremote(self.api_service_v1) def register_client(request, node_id, port, data): self.stats.add_avg('register_client') return self.register_node(node_id, web.ctx['ip'], port, data, NodeType.client) @tcpremote(self.zmq_server, name='register_client') #@tcpremote(self.server, name='register_client') def register_client_tcp(handler, request, node_id): self.stats.add_avg('register_client_tcp') return self.register_node_tcp(handler, request, node_id, NodeType.client) @jsonremote(self.api_service_v1) def unregister_slave(request, node_id): self.stats.add_avg('unregister_slave') return self.unregister_node(node_id, NodeType.slave) @jsonremote(self.api_service_v1) def unregister_client(request, node_id): self.stats.add_avg('unregister_client') return self.unregister_node(node_id, NodeType.client) @jsonremote(self.api_service_v1) def heartbeat_slave(request, node_id): self.stats.add_avg('heartbeat_slave') return self.heartbeat(node_id, NodeType.slave) @jsonremote(self.api_service_v1) def heartbeat_client(request, node_id): self.stats.add_avg('heartbeat_client') return self.heartbeat(node_id, NodeType.client) @tcpremote(self.zmq_server) #@tcpremote(self.server) def task_finished(handler, request, task, result, error): self.stats.add_avg('task_finished') 
    def _generate_status_dict(self, node):
        return {'type': node.type, 'state': node.state}

    def status(self):
        status = ComputeNode.status(self)
        with self.registry_mirror_lock.readlock:
            status['nodes'] = dict((k, self._generate_status_dict(v)) for k, v in self.node_registry_mirror.iteritems() if v)
        with self.client_registry_mirror_lock.readlock:
            status['clients'] = dict((k, self._generate_status_dict(v)) for k, v in self.client_registry_mirror.iteritems() if v)
        return status

    def on_update(self, delta_time):
        super(MasterNode, self).on_update(delta_time)

        # Update the registry mirror
        self.registry_mirror_threshold -= delta_time
        if self.registry_mirror_threshold < 0:
            self.update_registry_mirror()
            self.registry_mirror_threshold = self.registry_mirror_timer

        # Handle inactive nodes or clean up empty node entries
        self.inactivity_threshold -= delta_time
        self.node_cleanup_threshold -= delta_time
        if self.inactivity_threshold < 0:
            self.update_inactive_nodes()
            self.inactivity_threshold = self.inactivity_timer
        elif self.node_cleanup_threshold < 0:
            self.clean_node_map()
            self.node_cleanup_threshold = self.registry_cleanup_timer

    def has_master(self):
        """
        Check if the node has a master or not. The master node has no master itself.
        """
        return False

    def _handle_timeout(self, node):
        """
        Handle the state of a given node by checking the node's heartbeat timestamp
        """
        elapsed_time = self.current_time - node['heartbeat']
        if node['state'] == NodeState.active and elapsed_time > self.inactivity_timer:
            self.log.info("Node %s set to inactive (t:%f)" % (node['node_id'], elapsed_time))
            node['state'] = NodeState.inactive
            self.set_registry_dirty()
        elif node['state'] == NodeState.inactive and elapsed_time > self.inactivity_unregister_timer:
            # Delete the node! Too much time inactive!
            self.log.info("Node %s kicked from the system! Too much time inactive! (t:%f)" % (node['node_id'], elapsed_time))
            self.set_registry_dirty()
            return None
        return node
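    # Illustrative sketch (not part of the original module): the inactivity
    # timeline that _handle_timeout enforces, assuming a hypothetical
    # heartbeat_timer of 10 seconds and the default multiplier of 3.
    #
    #   heartbeat_timer             = 10.0                 # assumed value
    #   inactivity_timer            = 10.0 * 3  ->  30.0s  # active -> inactive
    #   inactivity_unregister_timer = 30.0 * 3  ->  90.0s  # inactive -> removed
    #
    # A node that misses roughly three heartbeats is marked inactive; one that
    # stays silent for three times that window is dropped from the registry.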
(t:%f)" % (node['node_id'], ellapsed_time)) self.set_registry_dirty() return None return node def set_registry_dirty(self): """ Set the registry dirty, this will force an update of the task scheduler """ self.registry_mirror_dirty = True self.update_scheduler() def update_scheduler(self): """ Update task scheduler with the current list of slaves """ self.task_scheduler.rate_slaves() def update_inactive_nodes(self): """ Called when we check for inactive nodes, those that have not send any heartbeat for a while """ self.log.info("Checking for inactive nodes...") with self.registry_lock.writelock: self.node_registry = dict((k, self._handle_timeout(v)) for k, v in self.node_registry.iteritems() if v) with self.client_registry_lock.writelock: self.client_registry = dict((k, self._handle_timeout(v)) for k, v in self.client_registry.iteritems() if v) def update_registry_mirror(self): """ Update the registry mirror with a copy of the registry. Used to expose a copy dict to the public. """ if self.registry_mirror_dirty: self.log.info("Updating node registry mirror...") with self.registry_mirror_lock.writelock: self.node_registry_mirror = dict((k, v) for k, v in self.node_registry.iteritems() if v) with self.client_registry_mirror_lock.writelock: self.client_registry_mirror = dict((k, v) for k, v in self.client_registry.iteritems() if v) self.registry_mirror_dirty = False def clean_node_map(self): """ Clean node map for any empty node values. """ self.log.info("Cleaning node registry...") with self.registry_lock.writelock: self.node_registry = dict((k, v) for k, v in self.node_registry.iteritems() if v) with self.client_registry_lock.writelock: self.client_registry = dict((k, v) for k, v in self.client_registry.iteritems() if v) def get_node_id_no_lock(self, url): return next((k for k, v in self.node_registry.iteritems() if v and v.url == url), None) def get_node_id(self, url): """ Return a node id given an url """ with self.registry_lock.readlock: node_id = self.get_node_id_no_lock(url) return node_id def get_client_id_no_lock(self, url): return next((k for k, v in self.client_registry.iteritems() if v and v.url == url), None) def get_client_id(self, url): """ Return a client id given an url """ with self.client_registry_lock.readlock: node_id = self.get_client_id_no_lock(url) return node_id def get_node(self, url): """ Get a node representation given an url """ node = None with self.registry_lock.readlock: node_id = self.get_node_id_no_lock(url) if node_id: node = self.node_registry[node_id] return node def get_client(self, url): """ Get a node representation given an url """ node = None with self.registry_lock.readlock: node_id = self.get_client_id_no_lock(url) if node_id: node = self.node_registry[node_id] return node def _default_node(self): return {} def _default_tasksystem(self): return Bunch({}) def _default_slave_bunch(self): return Bunch({'node_id':'', 'url':'', 'ip':'', 'port':0, 'type':NodeType.slave, 'state':NodeState.inactive, 'heartbeat':0, 'proxy':None, 'workers':0, 'tasks':0, 'rating':0.0, 'handler': None}) def _default_client_bunch(self): return Bunch({'node_id':'', 'url':'', 'ip':'', 'port':0, 'type':NodeType.slave, 'state':NodeState.inactive, 'heartbeat':0, 'proxy':None, 'handler': None}) def register_node(self, node_id, ip, port, data, node_type): """ Register a node within our node map """ try: # TODO: CHECK ALL CLIENT DATA! 
url = ("%s:%d") % (ip, port) if NodeType.slave == node_type: with self.registry_lock.writelock: node = self.get_node(url) if node is None: # This is a node that is registering again so reuse it node = self.node_registry[node_id] = self._default_slave_bunch() # Basic node values node.node_id = node_id node.url = url node.ip = ip node.port = port node.type = node_type node.proxy = self.create_node_proxy(url) node.state = NodeState.pending node.heartbeat = time.time() # Add slave data node.workers = data['workers'] node.tasks = 0 # Rating goes from [0, ..) 0 is the best rating and so asuitable candidate node.rating = 0 node.handler = None node.tcp_proxy = None # Make sure the mirror updates properly self.set_registry_dirty() # Send back the generated id return {'id': node.node_id, 'port': self.master_port} elif NodeType.client == node_type: with self.client_registry_lock.writelock: node = self.get_node(url) if node is None: # This is a node that is registering again so reuse it node = self.client_registry[node_id] = self._default_client_bunch() # Basic node values node.node_id = node_id node.url = url node.ip = ip node.port = port node.type = node_type node.proxy = self.create_node_proxy(url) node.state = NodeState.pending node.heartbeat = time.time() # Add client data node.handler = None node.tcp_proxy = None # Make sure the mirror updates properly self.set_registry_dirty() # Send back the generated id return {'id': node.node_id, 'port': self.master_port} else: raise NotImplementedError("Unkown node") except Exception as e: traceback.print_exc() # Make sure to cleanup node from node map! if node_id: self.unregister_node(node_id, node_type) raise e def unregister_node(self, node_id, node_type): """ Unregister a node within our node map """ if NodeType.slave == node_type: with self.registry_lock.writelock: if node_id in self.node_registry: self.node_registry[node_id] = None # Make sure we let the mirror update self.registry_mirror_dirty = True self.set_registry_dirty() return True return False elif NodeType.client == node_type: with self.client_registry_lock.writelock: if node_id in self.client_registry: # if we had a socket close it now! self.client_registry[node_id] = None # Get rid of any registered task system with self.tasksystem_lock.writelock: if node_id in self.tasksystem_registry: del self.tasksystem_registry[node_id] # Make sure we let the mirror update self.registry_mirror_dirty = True self.set_registry_dirty() return True return False else: raise NotImplementedError("Unkown node") def register_node_tcp(self, handler, request, node_id, node_type): """ Slave has just registered itself throug the compute channel """ if NodeType.slave == node_type: with self.registry_lock.writelock: if node_id in self.node_registry: # The handler is shared between many client sockets! self.node_registry[node_id].handler = handler self.node_registry[node_id].socket = handler.worker #self.node_registry[node_id].tcp_proxy = self.create_tcp_client_proxy(handler.worker, request) self.node_registry[node_id].tcp_proxy = self.create_tcp_client_proxy_zmq(self.zmq_server.context, request) self.node_registry[node_id].state = NodeState.active # Let the slave know that the handshake worked return True return False elif NodeType.client == node_type: with self.client_registry_lock.writelock: if node_id in self.client_registry: # The handler is shared between many client sockets! 
    def notify_shutdown(self):
        """
        Notify a global shutdown to all nodes
        """
        with self.registry_lock.readlock:
            for node_id in self.node_registry:
                if self.node_registry[node_id] and self.node_registry[node_id].proxy:
                    try:
                        self.node_registry[node_id].proxy.master_disconnected()
                    except:
                        # Best effort: the node may already be gone
                        pass
        with self.client_registry_lock.readlock:
            for node_id in self.client_registry:
                if self.client_registry[node_id] and self.client_registry[node_id].proxy:
                    try:
                        self.client_registry[node_id].proxy.master_disconnected()
                    except:
                        # Best effort: the client may already be gone
                        pass

    def heartbeat(self, node_id, node_type):
        """
        We just received a beat from a node; update its last heartbeat timestamp to prevent timeouts
        """
        if NodeType.slave == node_type:
            with self.registry_lock.writelock:
                if node_id in self.node_registry and self.node_registry[node_id]:
                    self.node_registry[node_id].heartbeat = time.time()
                    if self.node_registry[node_id].state == NodeState.inactive:
                        self.node_registry[node_id].state = NodeState.active
                    #self.log.info("Node %s just ticked" % (node_id))
                    return True
            return False
        elif NodeType.client == node_type:
            with self.client_registry_lock.writelock:
                if node_id in self.client_registry and self.client_registry[node_id]:
                    self.client_registry[node_id].heartbeat = time.time()
                    if self.client_registry[node_id].state == NodeState.inactive:
                        self.client_registry[node_id].state = NodeState.active
                    #self.log.info("Node %s just ticked" % (node_id))
                    return True
            return False
        else:
            raise NotImplementedError("Unknown node type")

    def rpc_call_failed(self, proxy, method, reason):
        """
        Called when an RPC call failed for an unexpected reason
        """
        self.log.info("Method %s failed because of %s" % (method, reason))

    def rpc_call_success(self, proxy, method, result):
        """
        Called when an RPC call succeeded
        """
        self.log.info("Method %s succeeded with %s" % (method, result))
        return result
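    # Illustrative sketch (not part of the original module): the heartbeat loop
    # a slave might run against the JSON-RPC API. The proxy object and the
    # heartbeat_timer value are assumptions; only heartbeat_slave and its
    # True/False contract come from the handlers above.
    #
    #   while running:
    #       if not master_api.heartbeat_slave(node_id):
    #           # The master no longer knows us (e.g. we were kicked after
    #           # inactivity), so run the registration handshake again.
    #           re_register()
    #       time.sleep(heartbeat_timer)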
    def push_tasksystem(self, request, tasksystem):
        """
        We received a task system from a client. Initialize it, hand it to the
        scheduler and save the system itself for later access.
        """
        # Easier access
        node_id = request

        # Now register the system
        with self.tasksystem_lock.writelock:
            # No re-registering!
            system_id = tasksystem.system_id
            if system_id in self.tasksystem_registry:
                return False

            # Save the system in the registry
            system_entry = self.tasksystem_registry[system_id] = self._default_tasksystem()
            system_entry.system = tasksystem
            system_entry.client_id = node_id
            system_entry.system_id = system_id

            # Now gather tasks and push them onto the system
            system_entry.system.log = self.log
            system_entry.system.init_system(self)
            self.task_scheduler.start_system(system_entry.system)
            return True

    def push_task(self, request, task):
        """
        We received a task from a client; add it to the scheduler to be processed
        """
        if isinstance(task, Task):
            self.task_scheduler.push_task(task)
            return True
        return False

    def task_finished(self, task, result, error):
        """
        Called when a task has finished its computation. The result object contains the task,
        the result or an error, and additional information.
        """
        # If the task does not specify an ITaskSystem id it is a single executed task which is
        # not controlled by a dedicated autonomous system on the master
        if task.system_id is None:
            client_id = task.client_id
            with self.client_registry_lock.readlock:
                if client_id in self.client_registry:
                    self.client_registry[client_id].tcp_proxy.task_finished(task.task_id, result, error)
        else:
            # If we do have a system id let the system process the result instead
            with self.tasksystem_lock.writelock:
                if task.system_id in self.tasksystem_registry:
                    system_entry = self.tasksystem_registry[task.system_id]
                    system_entry.system.task_finished(self, task, result, error)

                    # Inform the scheduler of the finished task
                    self.task_scheduler.task_finished(task, result, error)

                    # Check for completion
                    if system_entry.system.is_complete(self):
                        try:
                            # Gather results
                            final_results = system_entry.system.gather_result(self)

                            # Send the results to the client proxy
                            client_id = system_entry.client_id
                            with self.client_registry_lock.readlock:
                                if client_id in self.client_registry:
                                    self.client_registry[client_id].tcp_proxy.work_finished(final_results, system_entry.system.system_id)
                        finally:
                            del self.tasksystem_registry[task.system_id]
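    # Illustrative sketch (not part of the original module): the life cycle of a
    # client-submitted ITaskSystem as implemented by the three methods above.
    #
    #   1. push_tasksystem(client_id, system) registers the system, calls
    #      system.init_system(master) and task_scheduler.start_system(system).
    #   2. Slaves report results via the task_finished TCP endpoint, which routes
    #      each result to system.task_finished(...) and notifies the scheduler.
    #   3. Once system.is_complete(master) is True, gather_result(master) builds
    #      the final result, which is sent back to the owning client through
    #      tcp_proxy.work_finished(final_results, system_id), and the registry
    #      entry is removed.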