def communicate(self, input=None):
    """Feed 'input' to the child's stdin and gather everything it writes
    to stdout and stderr; modelled on Popen's communicate.

    Must be used with 'yield' as
    'stdout, stderr = yield async_pipe.communicate()'

    'input' must be either data (str or bytes) or an object with a
    'read' method (i.e., regular file object or AsyncFile object).
    Streams that are not set on this pipe are reported as None.
    """
    chunk_size = 16384

    def write_proc(fd, input, coro=None):
        # Coroutine: push all of 'input' into 'fd' and then close 'fd'.
        if isinstance(input, (str, bytes)):
            sent = yield fd.write(input, full=True)
            if sent != len(input):
                raise IOError('write failed')
        else:
            # TODO: how to know if 'input' is file object for
            # on-disk file?
            if hasattr(input, 'seek') and hasattr(input, 'fileno'):
                fetch = partial_func(os.read, input.fileno())
            else:
                fetch = input.read
            chunk = yield fetch(chunk_size)
            while chunk:
                if isinstance(chunk, str):
                    chunk = chunk.encode()
                sent = yield fd.write(chunk, full=True)
                if sent != len(chunk):
                    raise IOError('write failed')
                chunk = yield fetch(chunk_size)
            input.close()
        fd.close()

    def read_proc(fd, coro=None):
        # Coroutine: read 'fd' until it returns nothing, close it and
        # exit with the concatenated bytes.
        pieces = []
        piece = yield fd.read(chunk_size)
        while piece:
            pieces.append(piece)
            piece = yield fd.read(chunk_size)
        fd.close()
        raise StopIteration(b''.join(pieces))

    # Readers are started before the writer so they run while input is
    # being fed.
    if self.stdout:
        stdout_coro = Coro(read_proc, self.stdout)
    if self.stderr:
        stderr_coro = Coro(read_proc, self.stderr)
    if input and self.stdin:
        stdin_coro = Coro(write_proc, self.stdin, input)
        yield stdin_coro.finish()

    out_data = None
    if self.stdout:
        out_data = yield stdout_coro.finish()
    err_data = None
    if self.stderr:
        err_data = yield stderr_coro.finish()
    raise StopIteration(out_data, err_data)
def map_results(self, gen, iter):
    """Run generator 'gen' once for every entry of the given iterable and
    collect the results.

    Each entry supplies the positional arguments for one run: a tuple is
    used as is, any other object with '__iter__' is converted to a tuple,
    and anything else is passed as a single argument. The list of results
    is in the same order as the entries.

    Must be used with 'yield', as for example,
    'results = yield scheduler.map_results(generator, list_of_tuples)'.
    """
    def exec_proc(gen, *args):
        yield self.execute(gen, *args)

    workers = []
    for entry in iter:
        if isinstance(entry, tuple):
            call_args = entry
        elif hasattr(entry, '__iter__'):
            call_args = tuple(entry)
        else:
            call_args = (entry,)
        workers.append(Coro(exec_proc, gen, *call_args))

    # All coroutines are already created above; now wait for them one by
    # one, in submission order.
    results = []
    for worker in workers:
        value = yield worker.finish()
        results.append(value)
    raise StopIteration(results)
def finish(self, close=False):
    """Wait until all scheduled coroutines finish. If 'close' is True,
    'proc_close' (if any) is run for every server recorded in
    '_close_servers' and the computation is closed as well.

    Must be used with 'yield' as 'yield job_scheduler.finish()'.
    """
    self._rcoros_done.clear()
    if self._rcoros:
        yield self._rcoros_done.wait()

    if close:
        closers = []
        if self._proc_close:
            for location in self._close_servers:
                closers.append(Coro(self._proc_close,
                                    discoro.Scheduler.ServerInitialized, location))
        # forget the servers before waiting, so they are handled only once
        self._close_servers = {}
        for closer in closers:
            yield closer.finish()

    # wait again for anything that is still tracked in '_rcoros'
    self._rcoros_done.clear()
    if self._rcoros:
        yield self._rcoros_done.wait()

    if close:
        yield self.computation.close()
    self._askew_results.clear()
def setup_proc(self, msg, coro=None):
    """Coroutine: run 'proc_available' for the server in 'msg.info' and,
    if it exits with 0, make that server available for scheduling.

    When the scheduler is remote, the server is first added as a peer.
    """
    if self._remote_scheduler:
        yield self.asyncoro.peer(msg.info)
    setup_status = yield Coro(self._proc_available, msg.info).finish()
    if setup_status == 0:
        location = msg.info
        # remembered so 'proc_close' can be run for this server later
        self._close_servers[location] = location
        self._servers[location] = location
        self._server_avail.set()
def __init__(self):
    """Set up an empty service table and start the message dispatcher
    coroutine."""
    logger.setLevel(logging.INFO)
    self._services = {}
    # NOTE: messages are dispatched from a coroutine, so a service that
    # blocks (e.g., for I/O) stalls all application communication;
    # destination services that need to wait should do so asynchronously.
    self._dispatcher_coro = Coro(self._message_dispatcher)
def coro1(coro=None):
    """Client coroutine: locate RCI 'rci_1', start 5 remote coroutines with
    it, have each one monitored and send each a message."""
    # If the server is on a remote network, automatic discovery won't
    # work; add it explicitly in that case, e.g.:
    # yield scheduler.peer('192.168.21.5')

    # Find where 'rci_1' is registered; alternately, location can be
    # explicitly created with asyncoro.Location or obtained with
    # 'locate_peer' etc.
    rloc = yield scheduler.locate_RCI('rci_1', timeout=2)
    if not rloc:
        raise Exception('failed')

    count = 5
    monitor = Coro(monitor_proc, count)
    for index in range(count):
        label = 'test%s' % index
        rcoro = yield scheduler.run_RCI(rloc, 'rci_1', label, b=index)
        logger.debug('rcoro: %s/%s', rcoro.name, rcoro._id)
        # have 'monitor' watch this remote coroutine
        yield monitor.monitor(rcoro)
        # send credentials
        rcoro.send('msg:%s' % index)
        # pause for a random period of up to a second before the next one
        pause = random.uniform(0, 1)
        yield coro.sleep(pause)
def __init__(self, message_router):
    """Register as the node service with 'message_router', then start the
    scheduler thread and the stabilize coroutine."""
    super(Node_Service, self).__init__(SERVICE_NODE, message_router, cl_name="ns")

    # state is initialized before the worker thread is started
    self.nodes = {}
    self.queue = deque()
    self.delay_queue = deque()
    self.exit = False
    self.pause_scheduler = False

    worker = Thread(target=self._thread_scheduler)
    worker.daemon = True
    self.scheduler_thread = worker
    worker.start()

    self._stabilize_coro = Coro(self._coro_stabilize)
def _server_connect(self, coro=None):
    """Coroutine: open the outbound TCP connection to the peer.

    Creates an AsynCoroSocket with TCP_NODELAY set, connects it to
    (self.remote_ip, self.remote_port), starts the '_client_send'
    coroutine and then calls 'network_service.on_server_connect' with this
    object and 'self.context'.

    Any ordinary failure is reported through 'show_error' rather than
    raised. GeneratorExit, KeyboardInterrupt and SystemExit are not
    intercepted, so stopping this coroutine or the interpreter is not
    misreported as a connection error.
    """
    try:
        self.outbound_socket = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_STREAM))
        # disable Nagle's algorithm: messages are sent UDP style, i.e.,
        # small and immediate
        self.outbound_socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
        yield self.outbound_socket.connect((self.remote_ip, self.remote_port))
        self._send_coro = Coro(self._client_send)
        # A receive coroutine is not started; it is unneeded as long as
        # communication is not bi-directional.
        self.network_service.on_server_connect(self, self.context)
    except Exception:
        show_error()
def communicate(self, input=None):
    """Counterpart of Popen's communicate for asynchronous pipes: write
    'input' (if given) to stdin and read stdout and stderr to the end.

    Must be used with 'yield' as
    'stdout, stderr = yield async_pipe.communicate()'

    'input' must be either data (str or bytes) or an object with a
    'read' method (i.e., regular file object or AsyncFile object).
    """
    block = 16384

    def write_proc(fd, input, coro=None):
        # Coroutine that writes 'input' fully to 'fd' and closes 'fd'.
        if isinstance(input, (str, bytes)):
            written = yield fd.write(input, full=True)
            if written != len(input):
                raise IOError('write failed')
            fd.close()
            return

        # TODO: how to know if 'input' is file object for
        # on-disk file?
        if hasattr(input, 'seek') and hasattr(input, 'fileno'):
            reader = partial_func(os.read, input.fileno())
        else:
            reader = input.read
        while True:
            payload = yield reader(block)
            if not payload:
                break
            if isinstance(payload, str):
                payload = payload.encode()
            written = yield fd.write(payload, full=True)
            if written != len(payload):
                raise IOError('write failed')
        input.close()
        fd.close()

    def read_proc(fd, coro=None):
        # Coroutine that drains 'fd', closes it and exits with the data.
        received = []
        while True:
            part = yield fd.read(block)
            if not part:
                break
            received.append(part)
        fd.close()
        raise StopIteration(b''.join(received))

    if self.stdout:
        stdout_coro = Coro(read_proc, self.stdout)
    if self.stderr:
        stderr_coro = Coro(read_proc, self.stderr)
    if input and self.stdin:
        stdin_coro = Coro(write_proc, self.stdin, input)
        yield stdin_coro.finish()

    if self.stdout:
        stdout_data = yield stdout_coro.finish()
    else:
        stdout_data = None
    if self.stderr:
        stderr_data = yield stderr_coro.finish()
    else:
        stderr_data = None
    raise StopIteration(stdout_data, stderr_data)
def status_proc(self, msg, coro=None):
    """Coroutine: run 'proc_status' with the status and info of 'msg'; if
    it exits with 0, make the server in 'msg.info' available for
    scheduling."""
    verdict = yield Coro(self._proc_status, msg.status, msg.info).finish()
    if verdict == 0:
        location = msg.info
        self._servers[location] = location
        self._server_avail.set()
def setup_proc(self, msg, coro=None):
    """Coroutine: run 'proc_available' for the server in 'msg.info'; if it
    exits with 0, record the server for closing later and make it
    available for scheduling."""
    outcome = yield Coro(self._proc_available, msg.info).finish()
    if outcome == 0:
        location = msg.info
        self._close_servers[location] = location
        self._servers[location] = location
        self._server_avail.set()
def _status_proc(self, coro=None):
    """Internal use only. Coroutine to process discoro scheduler messages.

    Runs as a daemon coroutine and handles two kinds of messages:

    * asyncoro.MonitorException: a remote coroutine tracked in
      'self._rcoros' has finished; a waiting client coroutine (if any) is
      resumed with the exit value and, when the coroutine was counted
      against its server, the server is made available again.
    * DiscoroStatus: server / computation status changes; the user
      supplied 'proc_available', 'proc_status' and 'proc_close'
      generators are run as coroutines where applicable.

    The coroutine exits when ComputationClosed is received for the
    computation signature recorded at ComputationScheduled.
    """
    coro.set_daemon()
    # When the scheduler exits, wait for pending coroutines and close the
    # computation. NOTE(review): the meaning of the first argument (15) is
    # defined by the scheduler's 'atexit' -- confirm.
    coro.scheduler().atexit(15, lambda: Coro(self.finish, True).value())
    while 1:
        msg = yield coro.receive()
        if isinstance(msg, asyncoro.MonitorException):
            # args[0] is the remote coroutine; args[1] is a pair whose
            # second element is handed to the waiting client below.
            if msg.args[1][0] == discoro.Scheduler.ServerClosed:
                continue
            rcoro = msg.args[0]
            # entries are (client, use_count); 'missing' marks an rcoro
            # that has not been recorded yet
            client, use_count = self._rcoros.pop(rcoro, ('missing', 0))
            if client is None:
                # nobody is waiting for the result ('schedule' / 'submit_at')
                pass
            elif isinstance(client, Coro):
                # resume the client that is waiting in 'execute' / 'execute_at'
                client._proceed_(msg.args[1][1])
            elif client == 'missing':
                # Due to 'yield' used to create rcoro, scheduler may not
                # have updated self._rcoros before the coroutine's
                # MonitorException is received, so put it in
                # 'askew_results'. The scheduling coroutine will resend it
                # when it receives rcoro
                self._askew_results[rcoro] = msg
                continue
            else:
                asyncoro.logger.warning('RemoteCoroScheduler: invalid status message ignored')
                continue
            if use_count:
                # the coroutine occupied its server; release the server
                self._servers[rcoro.location] = rcoro.location
                self._server_avail.set()
            if not self._rcoros:
                # nothing left to wait for; wake up 'finish'
                self._rcoros_done.set()

        elif isinstance(msg, DiscoroStatus):
            if msg.status == discoro.Scheduler.ServerInitialized:
                if self._proc_available:
                    def setup_proc(self, msg, coro=None):
                        # server is used only if 'proc_available' exits with 0
                        if (yield Coro(self._proc_available, msg.info).finish()) == 0:
                            self._close_servers[msg.info] = msg.info
                            self._servers[msg.info] = msg.info
                            self._server_avail.set()
                    Coro(setup_proc, self, msg)
                elif self._proc_status:
                    def status_proc(self, msg, coro=None):
                        # server is used only if 'proc_status' exits with 0
                        if (yield Coro(self._proc_status, msg.status, msg.info).finish()) == 0:
                            self._servers[msg.info] = msg.info
                            self._server_avail.set()
                    Coro(status_proc, self, msg)
                else:
                    self._servers[msg.info] = msg.info
                    self._server_avail.set()
            elif msg.status == discoro.Scheduler.ServerClosed:
                self._servers.pop(msg.info, None)
                # 'proc_close' runs only for servers set up by 'proc_available'
                if self._close_servers.pop(msg.info, None) and self._proc_close:
                    Coro(self._proc_close, msg.status, msg.info)
                elif self._proc_status:
                    Coro(self._proc_status, msg.status, msg.info)
            elif msg.status == discoro.Scheduler.ComputationScheduled:
                # remember the signature to recognize the matching
                # ComputationClosed message
                self.computation_sign = msg.info
                if self._proc_status:
                    Coro(self._proc_status, msg.status, msg.info)
            elif (msg.status == discoro.Scheduler.ComputationClosed and
                  msg.info == self.computation_sign):
                if self._proc_status:
                    Coro(self._proc_status, msg.status, msg.info)
                # this computation is done; end the status coroutine
                raise StopIteration
            elif msg.status != discoro.Scheduler.CoroCreated:
                # every other status except CoroCreated is only forwarded
                if self._proc_status:
                    Coro(self._proc_status, msg.status, msg.info)
class RemoteCoroScheduler(object):
    """Scheduler for submitting computation jobs.

    When coroutines are created with 'run' methods of Computation
    instances, they are created with a load-balancing algorithm on
    available servers with no limit on how many coroutines are run at a
    server. This works when coroutines are not CPU bound always. If,
    however, coroutines are computations (CPU bound always/mostly), then
    it may be more appropriate to schedule one coroutine at a server so
    creating a new coroutine waits until a server becomes available.

    RemoteCoroScheduler schedules at most one computation (coroutine) at a
    server process at any time (so a node may execute as many computation
    coroutines as there are server processes running on that node, but not
    more).

    See 'discomp*.py' files in 'examples' directory for some use cases.

    NB: When using this scheduler, 'run' method of computation shouldn't
    be used to create (remote) coroutines (unless those don't take up
    CPU), as this scheduler is not aware of those.
    """

    def __init__(self, computation, proc_status=None, proc_available=None, proc_close=None):
        """'computation' should be an instance of discoro.Computation

        'proc_status' if not None should be a generator function that is
        called (as coroutine) with the status and info, as received by
        status_coro. If status is ServerInitialized and this function
        returns non-zero value, the server is ignored; i.e., jobs
        scheduled with 'schedule' or 'execute' will not use that server.

        'proc_available' if not None should be a generator function that
        is called (as coroutine) with the location of a server process
        when it becomes available (after all 'depends' of computation have
        been transferred). The coroutine runs at the client; it can create
        remote coroutine(s) at the server process, perhaps to setup, such
        as initializing global variables, transfer additional files
        etc. The coroutine should exit with 0 to indicate successful
        setup; any other value is interpreted as failure and not used by
        scheduler.

        'proc_close' if not None should be a generator function that is
        called (as coroutine) with the status and location of server
        process when server is about to be closed, or already closed. The
        coroutine runs at the client; it can create remote coroutine(s) at
        server process to cleanup, such as delete global variables,
        transfer files back to client etc. The coroutine is called with
        two parameters: 'status', which is either
        'discoro.Scheduler.ServerInitialized' when server is about to be
        closed (i.e., server is still available, and remote coroutines can
        be executed), or 'discoro.Scheduler.ServerClosed' when server is
        already closed (e.g., due to zombie_period time elapsed without
        communication, or server was manually closed with command-line
        etc.), and 'location' of server process.

        Callbacks that are not generator functions are ignored with a
        warning.
        """
        if proc_status:
            if not inspect.isgeneratorfunction(proc_status):
                asyncoro.logger.warning('Invalid proc_status ignored')
                proc_status = None
        if proc_available:
            if not inspect.isgeneratorfunction(proc_available):
                asyncoro.logger.warning('Invalid proc_available ignored')
                proc_available = None
        if proc_close:
            if not inspect.isgeneratorfunction(proc_close):
                asyncoro.logger.warning('Invalid proc_close ignored')
                proc_close = None
        self._proc_status = proc_status
        self._proc_available = proc_available
        self._proc_close = proc_close
        # servers for which 'proc_available' succeeded, i.e., the ones
        # 'proc_close' is to be run for
        self._close_servers = {}
        self.computation = computation
        # set from the ComputationScheduled status message
        self.computation_sign = None
        self.status_coro = Coro(self._status_proc)
        # a status coroutine already set on the computation is left alone
        if not computation.status_coro:
            computation.status_coro = self.status_coro
        # remote coroutine -> (client coroutine or None, use_count)
        self._rcoros = {}
        self._rcoros_done = asyncoro.Event()
        # results that arrived before their coroutine was recorded in _rcoros
        self._askew_results = {}
        # servers currently free to take a job
        self._servers = {}
        self._server_avail = asyncoro.Event()
        Coro(computation.schedule)

    def schedule(self, gen, *args, **kwargs):
        """Similar to 'run' method of computation, except as noted above:
        This method will block until a server process is available (i.e.,
        not running another computation).

        Must be used with 'yield', similar to 'run' method of Computation
        instance. The value returned is the result of computation's
        'run_at' (remote coroutine in case of success).
        """
        while not self._servers:
            self._server_avail.clear()
            yield self._server_avail.wait()
        # take a free server; it is given back when the coroutine finishes
        # (see '_status_proc') or right away if creation fails
        sloc, loc = self._servers.popitem()
        rcoro = yield self.computation.run_at(loc, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            # no client is waiting for the result; use_count 1 marks the
            # server as occupied by this coroutine
            self._rcoros[rcoro] = (None, 1)
            if self._askew_results:
                # result arrived before rcoro was recorded; resend it
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
        else:
            self._servers[sloc] = loc
            self._server_avail.set()
        raise StopIteration(rcoro)

    def execute(self, gen, *args, **kwargs):
        """Similar to 'run' method of computation, except as noted above:
        The caller (client coroutine) will block until a server process is
        available (i.e., not running another computation), where remote
        coroutine with given 'gen', 'args' and 'kwargs' runs and
        finishes. The return value is the result of computation.

        If the remote coroutine can't be created, a MonitorException
        carrying the failure is returned instead.

        Must be used with 'yield', similar to 'run' method of Computation
        instance.
        """
        while not self._servers:
            self._server_avail.clear()
            yield self._server_avail.wait()
        sloc, loc = self._servers.popitem()
        rcoro = yield self.computation.run_at(loc, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            client = asyncoro.AsynCoro.cur_coro()
            # '_status_proc' resumes 'client' with the result
            self._rcoros[rcoro] = (client, 1)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
            # NOTE(review): presumably suspends 'client' until
            # '_proceed_' is called from '_status_proc' -- confirm
            # against asyncoro internals
            client._await_()
        else:
            self._servers[sloc] = loc
            self._server_avail.set()
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

    def execute_at(self, where, gen, *args, **kwargs):
        """Similar to 'run_at' method of computation, except the calling
        coroutine is blocked until the computation finishes and exit value
        of computation is returned. Unlike 'execute', the computation is
        executed right away, even if remote server process is executing
        another computation.

        Must be used with 'yield', similar to 'run_at' method of
        Computation instance.
        """
        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            client = asyncoro.AsynCoro.cur_coro()
            # use_count 0: the server is not treated as occupied
            self._rcoros[rcoro] = (client, 0)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
            # NOTE(review): see 'execute' about '_await_'
            client._await_()
        else:
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

    def map_results(self, gen, iter):
        """Execute generator 'gen' with arguments from given
        iterable. The return value is list of results that correspond to
        executing 'gen' with arguments in iterable in the same order.

        Must be used with 'yield', as for example,
        'results = yield scheduler.map_results(generator, list_of_tuples)'.
        """
        def exec_proc(gen, *args):
            yield self.execute(gen, *args)

        coros = []
        append_coro = coros.append
        for params in iter:
            # each entry provides positional arguments for one execution
            if not isinstance(params, tuple):
                if hasattr(params, '__iter__'):
                    params = tuple(params)
                else:
                    params = (params,)
            append_coro(Coro(exec_proc, gen, *params))
        results = [None] * len(coros)
        for i, coro in enumerate(coros):
            result = yield coro.finish()
            results[i] = result
        raise StopIteration(results)

    def submit_at(self, where, gen, *args, **kwargs):
        """Similar to 'run_at' method of computation. If 'where' is None,
        the calling coroutine is blocked until any server is discovered
        and initialized (so computation's 'run_at' will not fail). Unlike
        'schedule', this method doesn't wait for server to be free (i.e.,
        not running any other coroutines), nor unlike 'execute_at', the
        caller is not blocked until the coroutine finishes.

        Must be used with 'yield', similar to 'run_at' method of
        Computation instance. The value returned is result of 'run_at'
        method of computation (reference to remote coroutine in case of
        success, and error otherwise).
        """
        if not where:
            # neither a free server nor a running coroutine is known, so
            # wait for a server to become available
            if not self._servers and not self._rcoros:
                yield self._server_avail.wait()
        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            self._rcoros[rcoro] = (None, 0)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
        raise StopIteration(rcoro)

    def submit(self, gen, *args, **kwargs):
        """Submit coroutine at any server; see 'submit_at' above.
        """
        yield self.submit_at(None, gen, *args, **kwargs)

    def finish(self, close=False):
        """Wait until all scheduled coroutines finish. If 'close' is True,
        the computation is closed as well.

        Must be used with 'yield' as 'yield job_scheduler.finish()'.
        """
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            if self._proc_close:
                # servers are still up at this point, hence the
                # ServerInitialized status passed to 'proc_close'
                coros = [Coro(self._proc_close, discoro.Scheduler.ServerInitialized, location)
                         for location in self._close_servers]
                self._close_servers = {}
                for coro in coros:
                    yield coro.finish()
            else:
                self._close_servers = {}
        # wait again for anything still tracked in '_rcoros'
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            yield self.computation.close()
        self._askew_results.clear()

    def _status_proc(self, coro=None):
        """Internal use only. Coroutine to process discoro scheduler messages.

        Handles MonitorException messages (a tracked remote coroutine
        finished) and DiscoroStatus messages (server / computation status
        changes). Exits when ComputationClosed is received for the
        recorded computation signature.
        """
        coro.set_daemon()
        # NOTE(review): the meaning of the first argument (15) is defined
        # by the scheduler's 'atexit' -- confirm
        coro.scheduler().atexit(15, lambda: Coro(self.finish, True).value())
        while 1:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                if msg.args[1][0] == discoro.Scheduler.ServerClosed:
                    continue
                rcoro = msg.args[0]
                client, use_count = self._rcoros.pop(rcoro, ('missing', 0))
                if client is None:
                    # nobody is waiting for the result
                    pass
                elif isinstance(client, Coro):
                    # resume client waiting in 'execute' / 'execute_at'
                    client._proceed_(msg.args[1][1])
                elif client == 'missing':
                    # Due to 'yield' used to create rcoro, scheduler may not
                    # have updated self._rcoros before the coroutine's
                    # MonitorException is received, so put it in
                    # 'askew_results'. The scheduling coroutine will resend it
                    # when it receives rcoro
                    self._askew_results[rcoro] = msg
                    continue
                else:
                    asyncoro.logger.warning('RemoteCoroScheduler: invalid status message ignored')
                    continue
                if use_count:
                    # the coroutine occupied its server; release the server
                    self._servers[rcoro.location] = rcoro.location
                    self._server_avail.set()
                if not self._rcoros:
                    self._rcoros_done.set()

            elif isinstance(msg, DiscoroStatus):
                if msg.status == discoro.Scheduler.ServerInitialized:
                    if self._proc_available:
                        def setup_proc(self, msg, coro=None):
                            # server is used only if setup exits with 0
                            if (yield Coro(self._proc_available, msg.info).finish()) == 0:
                                self._close_servers[msg.info] = msg.info
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(setup_proc, self, msg)
                    elif self._proc_status:
                        def status_proc(self, msg, coro=None):
                            # server is used only if 'proc_status' exits with 0
                            if (yield Coro(self._proc_status, msg.status, msg.info).finish()) == 0:
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(status_proc, self, msg)
                    else:
                        self._servers[msg.info] = msg.info
                        self._server_avail.set()
                elif msg.status == discoro.Scheduler.ServerClosed:
                    self._servers.pop(msg.info, None)
                    if self._close_servers.pop(msg.info, None) and self._proc_close:
                        Coro(self._proc_close, msg.status, msg.info)
                    elif self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                elif msg.status == discoro.Scheduler.ComputationScheduled:
                    self.computation_sign = msg.info
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                elif (msg.status == discoro.Scheduler.ComputationClosed and
                      msg.info == self.computation_sign):
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                    # this computation is done; end the status coroutine
                    raise StopIteration
                elif msg.status != discoro.Scheduler.CoroCreated:
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
class RemoteCoroScheduler(object):
    """Scheduler for submitting computation jobs.

    When coroutines are created with 'run' methods of Computation
    instances, they are created with a load-balancing algorithm on
    available servers with no limit on how many coroutines are run at a
    server. This works when coroutines are not CPU bound always. If,
    however, coroutines are computations (CPU bound always/mostly), then
    it may be more appropriate to schedule one coroutine at a server so
    creating a new coroutine waits until a server becomes available.

    RemoteCoroScheduler schedules at most one computation (coroutine) at a
    server process at any time (so a node may execute as many computation
    coroutines as there are server processes running on that node, but not
    more).

    See 'discomp*.py' files in 'examples' directory for some use cases.

    NB: When using this scheduler, 'run' method of computation shouldn't
    be used to create (remote) coroutines (unless those don't take up
    CPU), as this scheduler is not aware of those.
    """

    def __init__(self, computation, status=None, node_available=None,
                 proc_available=None, proc_close=None):
        """'computation' should be an instance of discoro.Computation

        'status' if not None should be a generator function that is called
        (as coroutine) with the status and info, as received by
        status_coro. If status is ServerInitialized and this function
        returns non-zero value, the server is ignored; i.e., jobs
        scheduled with 'schedule' or 'execute' will not use that server.

        'node_available' if not None should be a generator function that
        is called (as coroutine) with the info of a node when
        NodeDiscovered status is received. Its exit value is converted to
        a tuple and sent to the computation's scheduler as 'params' of a
        'setup_node' request. If not given, the computation's
        '_node_available' is used when that is set.

        'proc_available' if not None should be a generator function that
        is called (as coroutine) with the location of a server process
        when it becomes available (after all 'depends' of computation have
        been transferred). The coroutine runs at the client; it can create
        remote coroutine(s) at the server process, perhaps to setup, such
        as initializing global variables, transfer additional files
        etc. The coroutine should exit with 0 to indicate successful
        setup; any other value is interpreted as failure and not used by
        scheduler.

        'proc_close' if not None should be a generator function that is
        called (as coroutine) with the status and location of server
        process when server is about to be closed, or already closed. The
        coroutine runs at the client; it can create remote coroutine(s) at
        server process to cleanup, such as delete global variables,
        transfer files back to client etc. The coroutine is called with
        two parameters: 'status', which is either
        'discoro.Scheduler.ServerInitialized' when server is about to be
        closed (i.e., server is still available, and remote coroutines can
        be executed), or 'discoro.Scheduler.ServerClosed' when server is
        already closed (e.g., due to zombie_period time elapsed without
        communication, or server was manually closed with command-line
        etc.), and 'location' of server process.

        Callbacks that are not generator functions are ignored with a
        warning.
        """
        if status:
            if not inspect.isgeneratorfunction(status):
                asyncoro.logger.warning('Invalid status ignored')
                status = None
        if proc_available:
            if not inspect.isgeneratorfunction(proc_available):
                asyncoro.logger.warning('Invalid proc_available ignored')
                proc_available = None
        if proc_close:
            if not inspect.isgeneratorfunction(proc_close):
                asyncoro.logger.warning('Invalid proc_close ignored')
                proc_close = None
        # fall back to the callback configured on the computation
        if not node_available and computation._node_available:
            node_available = computation._node_available
        if node_available:
            if not inspect.isgeneratorfunction(node_available):
                asyncoro.logger.warning('Invalid node_available ignored')
                node_available = None

        self._status = status
        self._proc_available = proc_available
        self._proc_close = proc_close
        self._node_available = node_available
        # servers for which 'proc_available' succeeded, i.e., the ones
        # 'proc_close' is to be run for
        self._close_servers = {}
        self.computation = computation
        # set from the ComputationScheduled status message
        self.computation_sign = None
        self.status_coro = Coro(self._status_proc)
        if isinstance(computation.status_coro, Coro):
            # The computation already has a status coroutine; replace it
            # with a relay that forwards every message to that coroutine
            # first and then to this scheduler's status coroutine.
            def chain_status_msgs(status_coro, client, coro=None):
                coro.set_daemon()
                while True:
                    msg = yield coro.receive()
                    client.send(msg)
                    status_coro.send(msg)
            computation.status_coro = Coro(chain_status_msgs, self.status_coro,
                                           computation.status_coro)
        else:
            computation.status_coro = self.status_coro
        # remote coroutine -> (client coroutine or None, use_count)
        self._rcoros = {}
        self._rcoros_done = asyncoro.Event()
        # results that arrived before their coroutine was recorded in _rcoros
        self._askew_results = {}
        # servers currently free to take a job
        self._servers = {}
        self._server_avail = asyncoro.Event()
        # set to True at ComputationScheduled if the computation's
        # scheduler is at a different location than this asyncoro
        self._remote_scheduler = False
        self.asyncoro = asyncoro.AsynCoro()
        Coro(computation.schedule)

    def schedule(self, gen, *args, **kwargs):
        """Similar to 'run' method of computation, except as noted above:
        This method will block until a server process is available (i.e.,
        not running another computation).

        Must be used with 'yield', similar to 'run' method of Computation
        instance. The value returned is the result of computation's
        'run_at' (remote coroutine in case of success).
        """
        while not self._servers:
            self._server_avail.clear()
            yield self._server_avail.wait()
        # take a free server; it is given back when the coroutine finishes
        # (see '_status_proc') or right away if creation fails
        sloc, loc = self._servers.popitem()
        rcoro = yield self.computation.run_at(loc, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            # no client is waiting for the result; use_count 1 marks the
            # server as occupied by this coroutine
            self._rcoros[rcoro] = (None, 1)
            if self._askew_results:
                # result arrived before rcoro was recorded; resend it
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
        else:
            self._servers[sloc] = loc
            self._server_avail.set()
        raise StopIteration(rcoro)

    def execute(self, gen, *args, **kwargs):
        """Similar to 'run' method of computation, except as noted above:
        The caller (client coroutine) will block until a server process is
        available (i.e., not running another computation), where remote
        coroutine with given 'gen', 'args' and 'kwargs' runs and
        finishes. The return value is the result of computation.

        If the remote coroutine can't be created, a MonitorException
        carrying the failure is returned instead.

        Must be used with 'yield', similar to 'run' method of Computation
        instance.
        """
        while not self._servers:
            self._server_avail.clear()
            yield self._server_avail.wait()
        sloc, loc = self._servers.popitem()
        rcoro = yield self.computation.run_at(loc, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            client = asyncoro.AsynCoro.cur_coro()
            # '_status_proc' resumes 'client' with the result
            self._rcoros[rcoro] = (client, 1)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
            # NOTE(review): presumably suspends 'client' until
            # '_proceed_' is called from '_status_proc' -- confirm
            # against asyncoro internals
            client._await_()
        else:
            self._servers[sloc] = loc
            self._server_avail.set()
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

    def execute_at(self, where, gen, *args, **kwargs):
        """Similar to 'run_at' method of computation, except the calling
        coroutine is blocked until the computation finishes and exit value
        of computation is returned. Unlike 'execute', the computation is
        executed right away, even if remote server process is executing
        another computation.

        Must be used with 'yield', similar to 'run_at' method of
        Computation instance.
        """
        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            client = asyncoro.AsynCoro.cur_coro()
            # use_count 0: the server is not treated as occupied
            self._rcoros[rcoro] = (client, 0)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
            # NOTE(review): see 'execute' about '_await_'
            client._await_()
        else:
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

    def map_results(self, gen, iter):
        """Execute generator 'gen' with arguments from given
        iterable. The return value is list of results that correspond to
        executing 'gen' with arguments in iterable in the same order.

        Must be used with 'yield', as for example,
        'results = yield scheduler.map_results(generator, list_of_tuples)'.
        """
        def exec_proc(gen, *args):
            yield self.execute(gen, *args)

        coros = []
        append_coro = coros.append
        for params in iter:
            # each entry provides positional arguments for one execution
            if not isinstance(params, tuple):
                if hasattr(params, '__iter__'):
                    params = tuple(params)
                else:
                    params = (params,)
            append_coro(Coro(exec_proc, gen, *params))
        results = [None] * len(coros)
        for i, coro in enumerate(coros):
            result = yield coro.finish()
            results[i] = result
        raise StopIteration(results)

    def submit_at(self, where, gen, *args, **kwargs):
        """Similar to 'run_at' method of computation. If 'where' is None,
        the calling coroutine is blocked until any server is discovered
        and initialized (so computation's 'run_at' will not fail). Unlike
        'schedule', this method doesn't wait for server to be free (i.e.,
        not running any other coroutines), nor unlike 'execute_at', the
        caller is not blocked until the coroutine finishes.

        Must be used with 'yield', similar to 'run_at' method of
        Computation instance. The value returned is result of 'run_at'
        method of computation (reference to remote coroutine in case of
        success, and error otherwise).
        """
        if not where:
            # neither a free server nor a running coroutine is known, so
            # wait for a server to become available
            if not self._servers and not self._rcoros:
                yield self._server_avail.wait()
        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            self._rcoros[rcoro] = (None, 0)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
        raise StopIteration(rcoro)

    def submit(self, gen, *args, **kwargs):
        """Submit coroutine at any server; see 'submit_at' above.
        """
        yield self.submit_at(None, gen, *args, **kwargs)

    def finish(self, close=False):
        """Wait until all scheduled coroutines finish. If 'close' is True,
        the computation is closed as well.

        Must be used with 'yield' as 'yield job_scheduler.finish()'.
        """
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            if self._proc_close:
                # servers are still up at this point, hence the
                # ServerInitialized status passed to 'proc_close'
                coros = [Coro(self._proc_close, discoro.Scheduler.ServerInitialized, location)
                         for location in self._close_servers]
                self._close_servers = {}
                for coro in coros:
                    yield coro.finish()
            else:
                self._close_servers = {}
        # wait again for anything still tracked in '_rcoros'
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            yield self.computation.close()
        self._askew_results.clear()

    def _status_proc(self, coro=None):
        """Internal use only. Coroutine to process discoro scheduler messages.

        Handles MonitorException messages (a tracked remote coroutine
        finished) and DiscoroStatus messages (node / server / computation
        status changes). Exits when ComputationClosed is received for the
        recorded computation signature.
        """
        coro.set_daemon()
        # NOTE(review): the meaning of the first argument (15) is defined
        # by the scheduler's 'atexit' -- confirm
        coro.scheduler().atexit(15, lambda: Coro(self.finish, True).value())
        while 1:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                if msg.args[1][0] == discoro.Scheduler.ServerClosed:
                    continue
                rcoro = msg.args[0]
                client, use_count = self._rcoros.pop(rcoro, ('missing', 0))
                if client is None:
                    # nobody is waiting for the result
                    pass
                elif isinstance(client, Coro):
                    # resume client waiting in 'execute' / 'execute_at'
                    client._proceed_(msg.args[1][1])
                elif client == 'missing':
                    # Due to 'yield' used to create rcoro, scheduler may not
                    # have updated self._rcoros before the coroutine's
                    # MonitorException is received, so put it in
                    # 'askew_results'. The scheduling coroutine will resend it
                    # when it receives rcoro
                    self._askew_results[rcoro] = msg
                    continue
                else:
                    asyncoro.logger.warning('RemoteCoroScheduler: invalid status message ignored')
                    continue
                if use_count:
                    # the coroutine occupied its server; release the server
                    self._servers[rcoro.location] = rcoro.location
                    self._server_avail.set()
                if not self._rcoros:
                    self._rcoros_done.set()

            elif isinstance(msg, DiscoroStatus):
                if msg.status == discoro.Scheduler.ServerInitialized:
                    if self._proc_available:
                        def setup_proc(self, msg, coro=None):
                            # with a remote scheduler, add the server as
                            # a peer before running the setup
                            if self._remote_scheduler:
                                yield self.asyncoro.peer(msg.info)
                            # server is used only if setup exits with 0
                            if (yield Coro(self._proc_available, msg.info).finish()) == 0:
                                self._close_servers[msg.info] = msg.info
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(setup_proc, self, msg)
                    elif self._status:
                        def status_proc(self, msg, coro=None):
                            # server is used only if 'status' exits with 0
                            if (yield Coro(self._status, msg.status, msg.info).finish()) == 0:
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(status_proc, self, msg)
                    else:
                        self._servers[msg.info] = msg.info
                        self._server_avail.set()
                elif msg.status == discoro.Scheduler.ServerClosed:
                    self._servers.pop(msg.info, None)
                    if self._close_servers.pop(msg.info, None) and self._proc_close:
                        Coro(self._proc_close, msg.status, msg.info)
                    elif self._status:
                        Coro(self._status, msg.status, msg.info)
                elif msg.status == discoro.Scheduler.NodeDiscovered:
                    if self._node_available:
                        def setup_node(self, msg, coro=None):
                            if self._remote_scheduler:
                                yield self.asyncoro.peer(msg.info.location)
                            # NOTE(review): bare 'except' also traps
                            # GeneratorExit / KeyboardInterrupt; any
                            # failure of 'node_available' silently ends
                            # this coroutine without setting up the node
                            try:
                                params = yield asyncoro.Coro(self._node_available,
                                                             msg.info).finish()
                            except:
                                raise StopIteration
                            # normalize the exit value to a tuple
                            if not isinstance(params, tuple):
                                if hasattr(params, '__iter__'):
                                    params = tuple(params)
                                else:
                                    params = (params,)
                            # 'msg' is rebound here to the request sent
                            # to the computation's scheduler
                            msg = {'req': 'setup_node', 'addr': msg.info.location.addr,
                                   'params': params, 'auth': self.computation._auth,
                                   'client': coro}
                            self.computation.scheduler.send(msg)
                        Coro(setup_node, self, msg)
                elif msg.status == discoro.Scheduler.ComputationScheduled:
                    self.computation_sign = msg.info
                    if self.computation.scheduler.location != self.asyncoro.location:
                        self._remote_scheduler = True
                    if self._status:
                        Coro(self._status, msg.status, msg.info)
                elif (msg.status == discoro.Scheduler.ComputationClosed and
                      msg.info == self.computation_sign):
                    if self._status:
                        Coro(self._status, msg.status, msg.info)
                    # this computation is done; end the status coroutine
                    raise StopIteration
                elif msg.status != discoro.Scheduler.CoroCreated:
                    if self._status:
                        Coro(self._status, msg.status, msg.info)
def __init__(self, cpus, ip_addr=None, ext_ip_addr=None, node_port=None,
             scheduler_node=None, scheduler_port=None, dest_path_prefix='',
             secret='', keyfile=None, certfile=None, max_file_size=None,
             zombie_interval=60):
    """Initialize the node: resolve addresses, open sockets, start workers.

    'cpus' is the number of processors to serve; it must be between 1
    and multiprocessing.cpu_count().

    'ip_addr' is the address to bind the TCP server to; if not given,
    the address of the local host name is used.

    'ext_ip_addr' is the externally visible address; it defaults to
    'ip_addr'. The node's name is the reverse lookup of this address,
    falling back to the local host name when the lookup fails.

    'node_port' defaults to 51348 and 'scheduler_port' to 51347.

    'scheduler_node' is resolved with _node_ipaddr and handed to the
    UDP server coroutine.

    'dest_path_prefix' is the directory under which files are kept; it
    defaults to 'dispy' under the 'tmp' directory at the file system
    root, and is created (accessible to owner only) if it doesn't exist.

    'secret' is combined with a random signature to compute 'auth_code'.

    'keyfile' and 'certfile', if given, are converted to absolute paths;
    when 'certfile' is given the TCP socket is wrapped with SSL.

    'max_file_size' defaults to MaxFileSize.

    'zombie_interval' is multiplied by 60 before being stored
    (presumably it is given in minutes - TODO confirm).

    Raises Exception if 'ip_addr' or 'ext_ip_addr' can't be resolved.
    """
    assert 0 < cpus <= multiprocessing.cpu_count()
    self.cpus = cpus
    if ip_addr:
        ip_addr = _node_ipaddr(ip_addr)
        if not ip_addr:
            raise Exception('invalid ip_addr')
    else:
        self.name = socket.gethostname()
        ip_addr = socket.gethostbyname(self.name)
    if ext_ip_addr:
        ext_ip_addr = _node_ipaddr(ext_ip_addr)
        if not ext_ip_addr:
            raise Exception('invalid ext_ip_addr')
    else:
        ext_ip_addr = ip_addr
    try:
        self.name = socket.gethostbyaddr(ext_ip_addr)[0]
    except Exception:
        # Reverse lookup is best-effort; 'Exception' (rather than a bare
        # 'except') lets KeyboardInterrupt / SystemExit propagate.
        self.name = socket.gethostname()
    if not node_port:
        node_port = 51348
    if not scheduler_port:
        scheduler_port = 51347
    self.ip_addr = ip_addr
    self.ext_ip_addr = ext_ip_addr
    self.scheduler_port = scheduler_port
    self.pulse_interval = None
    self.keyfile = keyfile
    self.certfile = certfile
    if self.keyfile:
        self.keyfile = os.path.abspath(self.keyfile)
    if self.certfile:
        self.certfile = os.path.abspath(self.certfile)

    self.asyncoro = AsynCoro()

    # TCP server socket (SSL-wrapped when a certificate is configured)
    self.tcp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    if self.certfile:
        self.tcp_sock = ssl.wrap_socket(self.tcp_sock, keyfile=self.keyfile,
                                        certfile=self.certfile)
    self.tcp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.tcp_sock.bind((self.ip_addr, node_port))
    self.address = self.tcp_sock.getsockname()
    self.tcp_sock.listen(30)

    if dest_path_prefix:
        self.dest_path_prefix = dest_path_prefix.strip().rstrip(os.sep)
    else:
        self.dest_path_prefix = os.path.join(os.sep, 'tmp', 'dispy')
    if not os.path.isdir(self.dest_path_prefix):
        os.makedirs(self.dest_path_prefix)
        # restrict the freshly created directory to its owner
        os.chmod(self.dest_path_prefix, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
    if max_file_size is None:
        max_file_size = MaxFileSize
    self.max_file_size = max_file_size

    self.avail_cpus = self.cpus
    self.computations = {}
    self.scheduler_ip_addr = None
    self.file_uses = {}
    self.job_infos = {}
    self.lock = asyncoro.Lock()
    self.terminate = False
    self.signature = os.urandom(20).encode('hex')
    self.auth_code = hashlib.sha1(self.signature + secret).hexdigest()
    self.zombie_interval = 60 * zombie_interval

    logger.debug('auth_code for %s: %s', ip_addr, self.auth_code)

    # UDP socket is bound to all interfaces on the same port as TCP
    self.udp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    self.udp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    self.udp_sock.bind(('', node_port))
    logger.info('serving %s cpus at %s:%s', self.cpus, self.ip_addr, node_port)
    logger.debug('tcp server at %s:%s', self.address[0], self.address[1])
    self.udp_sock = AsynCoroSocket(self.udp_sock, blocking=False)

    scheduler_ip_addr = _node_ipaddr(scheduler_node)

    self.reply_Q = multiprocessing.Queue()
    self.reply_Q_thread = threading.Thread(target=self.__reply_Q)
    self.reply_Q_thread.start()

    self.timer_coro = Coro(self.timer_task)
    # self.tcp_coro = Coro(self.tcp_server)
    self.udp_coro = Coro(self.udp_server, scheduler_ip_addr)
class Message_Router():
    """Routes messages to the services registered under their service id.

    A single shared router is available through 'instance()'. Services
    are registered with 'register_service' and receive messages through
    their 'handle_message' method.

    NOTE: 'route' currently dispatches synchronously in the caller's
    thread; the dispatcher coroutine created in '__init__' only receives
    the 'None' wake-up sent by 'stop'.
    """

    _instance = None

    @classmethod
    def instance(cls):
        """Return the shared router, creating it on first use."""
        if not cls._instance:
            cls._instance = Message_Router()
        return cls._instance

    _commands = None
    exit = False

    def __init__(self):
        self._services = {}
        logger.setLevel(logging.INFO)
        # thread pool -- will burn up if services use the thread for blocking & totally
        # kill application communication. If you have to block for I/O then you better
        # be using async in the destination service
        self._dispatcher_coro = Coro(self._message_dispatcher)

    def _message_dispatcher(self, coro=None):
        """Coroutine: hand each received message to a thread pool task."""
        coro.set_daemon()
        thread_pool = AsynCoroThreadPool(2 * multiprocessing.cpu_count())
        while True:
            try:
                message = yield coro.receive()
                if self.exit:
                    # abandon any work & just cleanly exit
                    break
                yield thread_pool.async_task(coro, self._dispatch_message, message)
            except Exception:
                # 'Exception' (not a bare 'except') so that GeneratorExit
                # can still end this generator when it is closed
                show_error()
        print("Coro(_message_dispatcher) exiting")

    def _dispatch_message(self, message):
        """Deliver 'message' to the service named by 'message.dest_service'.

        Prints a warning when handling takes more than 50 ms, or when no
        service is registered under that id.
        """
        if message.dest_service in self._services:
            sw = Stopwatch()
            self._services[message.dest_service].handle_message(message)
            # for long running tasks (CPU-bound) you should pass off to a dedicated thread or tune
            # for I/O bound tasks you should queue the work until an I/O thread-pool thread can handle it
            if sw.ms() > 50:
                print("INVESTIGATE!!! %s(%s) took %0.3f ms!! Tuning may be required.'"
                      % (message.dest_service, message.Type(), sw.ms()))
        else:
            # report the id that failed the lookup above
            print("Unregistered service '%s'" % (message.dest_service,))

    def register_service(self, service_id, service):
        """Register 'service' under 'service_id' unless that id is taken."""
        if service_id not in self._services:
            self._services[service_id] = service

    def route(self, message):
        """Dispatch 'message' immediately in the calling thread."""
        self._dispatch_message(message)

    def stop(self):
        """Stop every registered service, then terminate the scheduler."""
        self.exit = True
        # tell all my services to stop
        for service in self._services.values():
            try:
                service.stop()
            except Exception:
                show_error()
        # wake the dispatcher so it notices 'exit' and leaves its loop
        self._dispatcher_coro.send(None)
        time.sleep(.1)
        AsynCoro.instance().terminate()

    def attach_console(self):
        """Read commands from standard input until quit or end of input.

        An empty line sends a "print" command to the node service. Other
        input is split once on the first space into a service name and a
        command; a lone word is looked up among the commands that each
        service reports through 'attach_to_console'.
        """
        while True:
            try:
                cmd = raw_input()
            except EOFError:
                # the user does not have a terminal
                return
            if cmd in ("q", "Q", "quit", "exit"):
                print("Exiting...")
                break
            try:
                node_info = None
                if SERVICE_NODE in self._services:
                    node_info = self._services[SERVICE_NODE].get_console_node()
                args = []
                splitted = cmd.split(' ', 1)
                if len(cmd) == 0:
                    # default action
                    self.route(Message_Console_Command(SERVICE_NODE, "print", args, node_info))
                elif len(splitted) == 1:
                    # try to find a service with the command
                    for svc in self._services.values():
                        if splitted[0] in svc.attach_to_console():
                            splitted.insert(0, svc.service_id)
                            break
                if len(splitted) > 1:
                    svcName = splitted[0]  # consoleName
                    command = splitted[1]  # commandName
                    # attempt to lookup service by consoleName
                    found = False
                    for svc in self._services.values():
                        if svc.cl_name == svcName or svc.service_id == svcName:
                            args.extend(splitted[2:])
                            svc.handle_message(
                                Message_Console_Command(svc.service_id, command, args, node_info))
                            found = True
                            break
                    if not found:
                        # see if a command was entered without a service name
                        for svc in self._services.values():
                            if svcName in svc.attach_to_console():
                                command = svcName
                                args.extend(splitted[1:])
                                svc.handle_message(
                                    Message_Console_Command(svc.service_id, command, args, node_info))
                                found = True
                                break
                    if not found:
                        print(svcName + " is an unregistered service.")
            except Exception:
                show_error()
def _discoro_proc():
    # coroutine
    """Server process receives computations and runs coroutines for it.

    The first message received must be a dictionary with 'req' set to
    'config'; its 'id', 'phoenix', 'serve', 'auth' and 'mp_queue' entries
    are used below. The coroutine registers itself as 'discoro_server'
    and then processes dictionary messages whose 'req' is one of 'run',
    'setup', 'close', 'node_info', 'status', 'quit' or 'terminate'.

    When the main loop ends, only 'close' requests are processed while
    jobs are still pending; then files under the server's directory and
    the PID file are removed and the configured 'auth' is put on
    'mp_queue'.

    Local names are prefixed with '_discoro_' (presumably to avoid
    clashing with names defined by computations, whose code is executed
    in this module's globals - TODO confirm).
    """

    import os
    import shutil
    import traceback
    import sys
    import time
    try:
        import psutil
    except:
        # psutil is optional; without it node status is not reported
        psutil = None

    import asyncoro.disasyncoro as asyncoro
    from asyncoro import Coro
    from asyncoro.discoro import MinPulseInterval, MaxPulseInterval, \
        DiscoroNodeInfo, DiscoroNodeStatus

    _discoro_coro = asyncoro.AsynCoro.cur_coro()
    _discoro_config = yield _discoro_coro.receive()
    assert _discoro_config['req'] == 'config'
    _discoro_coro.register('discoro_server')
    _discoro_name = asyncoro.AsynCoro.instance().name
    asyncoro.AsynCoro.instance().dest_path = os.path.join('discoro',
                                                          'server%s' % (_discoro_config['id']))
    _discoro_dest_path = asyncoro.AsynCoro.instance().dest_path
    # PID file lives next to (not inside) the server's directory
    _discoro_pid_path = os.path.join(_discoro_dest_path, '..',
                                     'server%s.pid' % (_discoro_config['id']))
    _discoro_pid_path = os.path.normpath(_discoro_pid_path)
    # TODO: is file locking necessary?
    if os.path.exists(_discoro_pid_path):
        with open(_discoro_pid_path, 'r') as _discoro_req:
            _discoro_var = _discoro_req.read()
        _discoro_var = int(_discoro_var)
        if not _discoro_config['phoenix']:
            print('\n Another discoronode seems to be running;\n'
                  ' make sure server with PID %d quit and remove "%s"\n' %
                  (_discoro_var, _discoro_pid_path))
            # without 'phoenix' the signal below is sent to this process itself
            _discoro_var = os.getpid()

        import signal
        try:
            os.kill(_discoro_var, signal.SIGTERM)
        except:
            pass
        else:
            time.sleep(0.1)
            try:
                if os.waitpid(_discoro_var, os.WNOHANG)[0] != _discoro_var:
                    asyncoro.logger.warning('Killing process %d failed' % _discoro_var)
            except:
                pass
        del signal
    # start with an empty directory for computation files
    if os.path.isdir(_discoro_dest_path):
        shutil.rmtree(_discoro_dest_path)
    os.makedirs(_discoro_dest_path)
    os.chdir(_discoro_dest_path)
    with open(_discoro_pid_path, 'w') as _discoro_var:
        _discoro_var.write('%s' % os.getpid())
    asyncoro.logger.debug('discoro server "%s" started at %s; '
                          'computation files will be saved in "%s"' %
                          (_discoro_name, _discoro_coro.location, _discoro_dest_path))
    # all names are bound before the snapshots below are taken
    _discoro_req = _discoro_client = _discoro_auth = _discoro_msg = None
    _discoro_timer_coro = _discoro_pulse_coro = _discoro_timer_proc = _discoro_peer_status = None
    _discoro_monitor_coro = _discoro_monitor_proc = _discoro_node_status = None
    _discoro_computation = _discoro_func = _discoro_var = None
    _discoro_job_coros = set()
    _discoro_busy_time = time.time()
    # snapshot of module globals; 'close' restores globals to this state
    _discoro_globals = {}
    # NOTE(review): _discoro_locals is not read again in this function
    _discoro_locals = {}
    _discoro_globals.update(globals())
    _discoro_locals.update(locals())

    def _discoro_timer_proc(coro=None):
        # Sends a pulse message to _discoro_pulse_coro every 'interval'
        # seconds. A truthy value returned by sleep (presumably the value
        # given to resume() - TODO confirm) becomes the new interval.
        coro.set_daemon()
        last_pulse = time.time()
        interval = None
        while True:
            reset = yield coro.sleep(interval)
            if reset:
                if not isinstance(_discoro_pulse_coro, Coro):
                    # no computation is set up; sleep without a timeout
                    interval = None
                    continue
                interval = reset
                last_pulse = time.time()
                continue
            if not _discoro_pulse_coro:
                continue
            msg = {'ncoros': len(_discoro_job_coros), 'location': coro.location}
            if _discoro_node_status:
                msg['node_status'] = DiscoroNodeStatus(coro.location.addr, psutil.cpu_percent(),
                                                       psutil.virtual_memory().percent,
                                                       psutil.disk_usage(_discoro_dest_path).percent)
            if _discoro_pulse_coro.send(msg) == 0:
                last_pulse = time.time()
            elif (time.time() - last_pulse) > (5 * interval) and _discoro_computation:
                # no pulse was delivered for more than 5 intervals
                asyncoro.logger.warning('scheduler is not reachable; closing computation "%s"' %
                                        _discoro_computation._auth)
                _discoro_coro.send({'req': 'close', 'auth': _discoro_computation._auth})

            if ((not _discoro_job_coros) and _discoro_computation.zombie_period and
                ((time.time() - _discoro_busy_time) > _discoro_computation.zombie_period)):
                asyncoro.logger.debug('%s: zombie computation "%s"' %
                                      (coro.location, _discoro_computation._auth))
                # TODO: close? For now wait for "too many" timeouts to close

    def _discoro_peer_status(coro=None):
        # Requests 'close' of the current computation when the peer at the
        # pulse coroutine's location is reported offline.
        coro.set_daemon()
        while True:
            status = yield coro.receive()
            if isinstance(status, asyncoro.PeerStatus) and \
               status.status == asyncoro.PeerStatus.Offline and \
               _discoro_pulse_coro and _discoro_pulse_coro.location == status.location:
                asyncoro.logger.debug('scheduler at %s quit; closing computation %s' %
                                      (status.location, _discoro_computation._auth))
                msg = {'req': 'close', 'auth': _discoro_computation._auth}
                _discoro_coro.send(msg)

    def _discoro_monitor_proc(coro=None):
        # Receives MonitorException messages for jobs (this coroutine is
        # given to job_coro.notify in 'run') and drops finished jobs.
        nonlocal _discoro_busy_time
        coro.set_daemon()
        while True:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                _discoro_busy_time = time.time()
                asyncoro.logger.debug('job %s done' % msg.args[0])
                _discoro_job_coros.discard(msg.args[0])
            else:
                asyncoro.logger.warning('%s: invalid monitor message ignored' % coro.location)

    _discoro_timer_coro = Coro(_discoro_timer_proc)
    _discoro_monitor_coro = Coro(_discoro_monitor_proc)
    asyncoro.AsynCoro.instance().peer_status(Coro(_discoro_peer_status))

    # main loop: one request (dictionary) per message
    while True:
        _discoro_msg = yield _discoro_coro.receive()
        if not isinstance(_discoro_msg, dict):
            continue
        _discoro_req = _discoro_msg.get('req', None)

        if _discoro_req == 'run':
            # create a coroutine for the function in 'func'; the client is
            # sent the new coroutine, or an (exception type, name, traceback)
            # tuple on failure, or None if the request itself is invalid
            _discoro_client = _discoro_msg.get('client', None)
            _discoro_auth = _discoro_msg.get('auth', None)
            _discoro_func = _discoro_msg.get('func', None)
            if not isinstance(_discoro_client, Coro) or not _discoro_computation or \
               _discoro_auth != _discoro_computation._auth:
                asyncoro.logger.warning('invalid run: %s' % (type(_discoro_func)))
                if isinstance(_discoro_client, Coro):
                    _discoro_client.send(None)
                continue
            try:
                _discoro_func = asyncoro.unserialize(_discoro_func)
                if _discoro_func.code:
                    exec(_discoro_func.code, globals())
                job_coro = Coro(globals()[_discoro_func.name],
                                *(_discoro_func.args), **(_discoro_func.kwargs))
            except:
                asyncoro.logger.debug('invalid computation to run')
                # _discoro_func = Scheduler._Function(_discoro_func.name, None,
                #                                     _discoro_func.args, _discoro_func.kwargs)
                job_coro = (sys.exc_info()[0], getattr(_discoro_func, 'name', _discoro_func),
                            traceback.format_exc())
            else:
                asyncoro.logger.debug('job %s created' % job_coro)
                _discoro_job_coros.add(job_coro)
                job_coro.notify(_discoro_monitor_coro)
                _discoro_var = _discoro_msg.get('notify', None)
                if isinstance(_discoro_var, Coro):
                    job_coro.notify(_discoro_var)
                _discoro_busy_time = time.time()
            _discoro_client.send(job_coro)
            del job_coro

        elif _discoro_req == 'setup':
            # accept a computation; client is sent 0 on success, -1 otherwise
            _discoro_client = _discoro_msg.get('client', None)
            # NOTE(review): the pulse coroutine is overwritten before the
            # 'busy' check below, so a rejected 'setup' appears to replace
            # the pulse coroutine of the active computation - confirm
            _discoro_pulse_coro = _discoro_msg.get('pulse_coro', None)
            if not isinstance(_discoro_client, Coro) or not isinstance(_discoro_pulse_coro, Coro):
                continue
            if _discoro_computation is not None:
                asyncoro.logger.debug('invalid "setup" - busy')
                _discoro_client.send(-1)
                continue
            os.chdir(_discoro_dest_path)
            try:
                _discoro_computation = _discoro_msg['computation']
                exec('import asyncoro.disasyncoro as asyncoro', globals())
                if __name__ == '__mp_main__':  # Windows multiprocessing process
                    exec('import asyncoro.disasyncoro as asyncoro',
                         sys.modules['__mp_main__'].__dict__)
                if _discoro_computation._code:
                    exec(_discoro_computation._code, globals())
                    if __name__ == '__mp_main__':  # Windows multiprocessing process
                        exec(_discoro_computation._code, sys.modules['__mp_main__'].__dict__)
            except:
                _discoro_computation = None
                asyncoro.logger.warning('invalid computation')
                asyncoro.logger.debug(traceback.format_exc())
                _discoro_client.send(-1)
                continue
            if psutil and _discoro_msg.get('node_status', None):
                _discoro_node_status = True
            # pulse interval outside [MinPulseInterval, MaxPulseInterval]
            # (or not an int) is replaced with MinPulseInterval
            if isinstance(_discoro_computation.pulse_interval, int) and \
               MinPulseInterval <= _discoro_computation.pulse_interval <= MaxPulseInterval:
                _discoro_computation.pulse_interval = _discoro_computation.pulse_interval
            else:
                _discoro_computation.pulse_interval = MinPulseInterval
            _discoro_timer_coro.resume(_discoro_computation.pulse_interval)
            _discoro_busy_time = time.time()
            asyncoro.logger.debug('computation "%s" from %s' %
                                  (_discoro_computation._auth, _discoro_msg['client'].location))
            _discoro_client.send(0)

        elif _discoro_req == 'close':
            # accepted with either the computation's auth or the server's
            # own configured auth
            _discoro_auth = _discoro_msg.get('auth', None)
            if not _discoro_computation or (_discoro_auth != _discoro_computation._auth and
                                            _discoro_auth != _discoro_config['auth']):
                continue
            asyncoro.logger.debug('%s deleting computation "%s"' %
                                  (_discoro_coro.location, _discoro_computation._auth))
            if _discoro_auth != _discoro_computation._auth and _discoro_pulse_coro:
                # closed with the server's auth rather than the computation's
                _discoro_pulse_coro.send({'status': 'ServerClosed',
                                          'location': _discoro_coro.location})
            for _discoro_var in _discoro_job_coros:
                _discoro_var.terminate()
            _discoro_job_coros = set()
            # remove names added since the snapshot and restore globals
            if __name__ == '__mp_main__':  # Windows multiprocessing process
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                        sys.modules['__mp_main__'].__dict__.pop(_discoro_var, None)
                globals().update(_discoro_globals)
                sys.modules['__mp_main__'].__dict__.update(_discoro_globals)
            else:
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                globals().update(_discoro_globals)

            # delete everything under the server's directory
            for _discoro_var in os.listdir(_discoro_dest_path):
                _discoro_var = os.path.join(_discoro_dest_path, _discoro_var)
                if os.path.isdir(_discoro_var) and not os.path.islink(_discoro_var):
                    shutil.rmtree(_discoro_var, ignore_errors=True)
                else:
                    os.remove(_discoro_var)
            if not os.path.isdir(_discoro_dest_path):
                try:
                    os.remove(_discoro_dest_path)
                except:
                    pass
                os.makedirs(_discoro_dest_path)
            # recreate the PID file if it is no longer a regular file
            if not os.path.isfile(_discoro_pid_path):
                try:
                    if os.path.islink(_discoro_pid_path):
                        os.remove(_discoro_pid_path)
                    else:
                        shutil.rmtree(_discoro_pid_path)
                    with open(_discoro_pid_path, 'w') as _discoro_var:
                        _discoro_var.write('%s' % os.getpid())
                except:
                    asyncoro.logger.warning('PID file "%s" is invalid' % _discoro_pid_path)
            os.chdir(_discoro_dest_path)
            asyncoro.AsynCoro.instance().dest_path = _discoro_dest_path
            _discoro_computation = _discoro_client = _discoro_pulse_coro = None
            _discoro_node_status = None
            # a positive 'serve' counts down per closed computation; the
            # server leaves the main loop when it reaches 0
            if _discoro_config['serve'] > 0:
                _discoro_config['serve'] -= 1
                if _discoro_config['serve'] == 0:
                    break
            _discoro_timer_coro.resume(MinPulseInterval)

        elif _discoro_req == 'node_info':
            # without psutil, -1 / None placeholders are sent
            if psutil:
                info = DiscoroNodeInfo(
                    _discoro_name, _discoro_coro.location.addr,
                    psutil.cpu_count(), psutil.cpu_percent(),
                    {_discoro_var: getattr(psutil.virtual_memory(), _discoro_var)
                     for _discoro_var in ['total', 'percent']},
                    {_discoro_var: getattr(psutil.disk_usage(_discoro_dest_path), _discoro_var)
                     for _discoro_var in ['total', 'percent']}
                    )
                if _discoro_msg.get('node_status', None):
                    _discoro_node_status = True
            else:
                info = DiscoroNodeInfo(_discoro_name, _discoro_coro.location.addr,
                                       -1, -1, None, None)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(info)

        elif _discoro_req == 'status':
            # requires the server's configured auth
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring info: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                print(' Server %s running %d coroutines for computation at %s' %
                      (_discoro_coro.location, len(_discoro_job_coros),
                       _discoro_pulse_coro.location))
            else:
                print(' Server %s not used by any computation' % (_discoro_coro.location))

        elif _discoro_req == 'quit':
            # leave the main loop; pending jobs are waited for below
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring quit: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerClosed',
                                          'location': _discoro_coro.location})
            break

        elif _discoro_req == 'terminate':
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring terminate: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerTerminated',
                                          'location': _discoro_coro.location})
            if _discoro_computation:
                # send 'close' to self; with 'serve' set to 1 the 'close'
                # handler breaks out of the main loop after cleaning up
                msg = {'req': 'close', 'auth': _discoro_computation._auth}
                _discoro_config['serve'] = 1
                _discoro_coro.send(msg)
            else:
                break

        else:
            asyncoro.logger.warning('invalid command "%s" ignored' % _discoro_req)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(-1)

    # wait until all computations are done; process only 'close'
    while _discoro_job_coros:
        _discoro_msg = yield _discoro_coro.receive()
        if not isinstance(_discoro_msg, dict):
            continue
        _discoro_req = _discoro_msg.get('req', None)
        if _discoro_req == 'close':
            _discoro_auth = _discoro_msg.get('auth', None)
            if not _discoro_computation or _discoro_auth != _discoro_computation._auth:
                continue
            asyncoro.logger.debug('%s deleting computation "%s"' %
                                  (_discoro_coro.location, _discoro_computation._auth))
            if __name__ == '__mp_main__':  # Windows multiprocessing process
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                        sys.modules['__mp_main__'].__dict__.pop(_discoro_var, None)
                globals().update(_discoro_globals)
                sys.modules['__mp_main__'].__dict__.update(_discoro_globals)
            else:
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                globals().update(_discoro_globals)
            break
        else:
            asyncoro.logger.warning('invalid command "%s" ignored' % _discoro_req)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(-1)

    # final cleanup: remove computation files and the PID file
    for _discoro_var in os.listdir(_discoro_dest_path):
        _discoro_var = os.path.join(_discoro_dest_path, _discoro_var)
        if os.path.isdir(_discoro_var) and not os.path.islink(_discoro_var):
            shutil.rmtree(_discoro_var, ignore_errors=True)
        else:
            os.remove(_discoro_var)
    if os.path.isfile(_discoro_pid_path):
        os.remove(_discoro_pid_path)
    # put the server's auth on the multiprocessing queue from the config
    _discoro_config['mp_queue'].put(_discoro_config['auth'])
    asyncoro.logger.debug('discoro server %s quit' % _discoro_coro.location)
class Peer_Remote():
    """Outbound connection to a remote peer.

    Creating an instance starts a coroutine that connects to
    (remote_ip, remote_port); once connected, a sender coroutine is
    started and 'network_service.on_server_connect' is called with this
    object and 'context'.

    NOTE: 'send' and 'stop' use the sender coroutine, which exists only
    after the connection has been established.
    """

    def __init__(self, network_service, remote_ip, remote_port, context=None):
        self.exit = False
        self.network_service = network_service
        self.remote_ip = remote_ip
        self.remote_port = remote_port
        self.context = context
        Coro(self._server_connect)

    def _server_connect(self, coro=None):
        """Coroutine: connect to the peer and start the sender coroutine."""
        try:
            self.outbound_socket = AsynCoroSocket(
                socket.socket(socket.AF_INET, socket.SOCK_STREAM))
            # if you're gonna act like UDP
            self.outbound_socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)
            yield self.outbound_socket.connect((self.remote_ip, self.remote_port))
            self._send_coro = Coro(self._client_send)
            # A receiver coroutine (_client_recv) is not started: unneeded
            # if we don't utilize bi-directional communication in UDP
            # style messaging
            self.network_service.on_server_connect(self, self.context)
        except Exception:
            # 'Exception' (not a bare 'except') so that GeneratorExit can
            # still end this generator when it is closed
            show_error()

    def _client_recv(self, coro=None):
        """Coroutine: pass received messages to the network service.

        Ends when no data (None or empty) is received or 'exit' is set.
        """
        while True:
            try:
                data = yield self.outbound_socket.recv_msg()
                if data is None or len(data) == 0 or self.exit:
                    break
                self.network_service.on_peer_data_received(data)
            except Exception:
                show_error()

    def _client_send(self, coro=None):
        """Coroutine: send queued data until a disconnect command arrives.

        Each message is '(cmd, (data, context))'. After the disconnect
        command the socket is shut down and closed.
        """
        coro.set_daemon()
        while True:
            try:
                cmd, state = yield self._send_coro.receive()
                data, context = state
                if cmd == NETWORK_PEER_DISCONNECT:
                    self.network_service.on_client_disconnected(context)
                    break
                yield self.outbound_socket.send_msg(data)
                self.network_service.on_client_data_sent(context)
            except Exception:
                show_error()
        self.outbound_socket.shutdown(socket.SHUT_RDWR)
        self.outbound_socket.close()

    def send(self, data, context):
        """Queue 'data' for sending unless this peer has been stopped."""
        if not self.exit:
            self._send_coro.send((None, (data, context)))

    def stop(self, context=None):
        """Mark this peer as stopped and queue the disconnect command."""
        self.exit = True
        self._send_coro.send((NETWORK_PEER_DISCONNECT, (None, context)))
def __init__(self, computation, proc_status=None, proc_available=None, proc_close=None):
    """Create scheduler for 'computation' (an instance of
    discoro.Computation) and schedule the computation.

    Each of the optional arguments below must be a generator function;
    an argument that is given but is not a generator function is
    dropped with a warning.

    'proc_status' is called (as coroutine) with the status and info, as
    received by status_coro. If status is ServerInitialized and it
    returns non-zero value, that server is ignored; i.e., jobs scheduled
    with 'schedule' or 'execute' will not use that server.

    'proc_available' is called (as coroutine) with the location of a
    server process when it becomes available (after all 'depends' of
    computation have been transferred). It runs at the client and can
    create remote coroutine(s) at the server process, e.g., to
    initialize global variables or transfer additional files. It should
    exit with 0 to indicate successful setup; any other value is
    interpreted as failure and the server is not used by scheduler.

    'proc_close' is called (as coroutine) with the status and location
    of server process when server is about to be closed, or already
    closed. It runs at the client and can create remote coroutine(s) at
    server process to cleanup, e.g., delete global variables or transfer
    files back to client. 'status' is either
    'discoro.Scheduler.ServerInitialized' when server is about to be
    closed (i.e., server is still available, and remote coroutines can
    be executed), or 'discoro.Scheduler.ServerClosed' when server is
    already closed (e.g., due to zombie_period time elapsed without
    communication, or server was manually closed with command-line
    etc.).
    """

    def generator_or_none(candidate, complaint):
        # keep 'candidate' unless it was given and is not a generator function
        if candidate and not inspect.isgeneratorfunction(candidate):
            asyncoro.logger.warning(complaint)
            return None
        return candidate

    self._proc_status = generator_or_none(proc_status, 'Invalid proc_status ignored')
    self._proc_available = generator_or_none(proc_available,
                                             'Invalid proc_available ignored')
    self._proc_close = generator_or_none(proc_close, 'Invalid proc_close ignored')

    self._close_servers = {}
    self.computation = computation
    self.computation_sign = None
    self.status_coro = Coro(self._status_proc)
    # status messages come to this scheduler unless the computation
    # already has a status coroutine
    if not computation.status_coro:
        computation.status_coro = self.status_coro
    self._rcoros = {}
    self._rcoros_done = asyncoro.Event()
    self._askew_results = {}
    self._servers = {}
    self._server_avail = asyncoro.Event()
    Coro(computation.schedule)
def __init__(self, computation, status=None, node_available=None,
             proc_available=None, proc_close=None):
    """Create scheduler for 'computation' (an instance of
    discoro.Computation) and schedule the computation.

    Each of the optional arguments below must be a generator function;
    an argument that is given but is not a generator function is
    dropped with a warning.

    'status' is called (as coroutine) with the status and info, as
    received by status_coro. If status is ServerInitialized and it
    returns non-zero value, that server is ignored; i.e., jobs scheduled
    with 'schedule' or 'execute' will not use that server.

    'node_available', if not given, is taken from
    'computation._node_available' (when that is set). It is only
    validated and stored here.

    'proc_available' is called (as coroutine) with the location of a
    server process when it becomes available (after all 'depends' of
    computation have been transferred). It runs at the client and can
    create remote coroutine(s) at the server process, e.g., to
    initialize global variables or transfer additional files. It should
    exit with 0 to indicate successful setup; any other value is
    interpreted as failure and the server is not used by scheduler.

    'proc_close' is called (as coroutine) with the status and location
    of server process when server is about to be closed, or already
    closed. It runs at the client and can create remote coroutine(s) at
    server process to cleanup, e.g., delete global variables or transfer
    files back to client. 'status' is either
    'discoro.Scheduler.ServerInitialized' when server is about to be
    closed (i.e., server is still available, and remote coroutines can
    be executed), or 'discoro.Scheduler.ServerClosed' when server is
    already closed (e.g., due to zombie_period time elapsed without
    communication, or server was manually closed with command-line
    etc.).

    If the computation already has a status coroutine, status messages
    are forwarded to both that coroutine and this scheduler.
    """

    def generator_or_none(candidate, complaint):
        # keep 'candidate' unless it was given and is not a generator function
        if candidate and not inspect.isgeneratorfunction(candidate):
            asyncoro.logger.warning(complaint)
            return None
        return candidate

    status = generator_or_none(status, 'Invalid status ignored')
    proc_available = generator_or_none(proc_available, 'Invalid proc_available ignored')
    proc_close = generator_or_none(proc_close, 'Invalid proc_close ignored')
    if not node_available and computation._node_available:
        node_available = computation._node_available
    node_available = generator_or_none(node_available, 'Invalid node_available ignored')

    self._status = status
    self._proc_available = proc_available
    self._proc_close = proc_close
    self._node_available = node_available
    self._close_servers = {}
    self.computation = computation
    self.computation_sign = None
    self.status_coro = Coro(self._status_proc)

    if not isinstance(computation.status_coro, Coro):
        computation.status_coro = self.status_coro
    else:
        def chain_status_msgs(status_coro, client, coro=None):
            # relay every status message to the computation's original
            # status coroutine first and then to this scheduler
            coro.set_daemon()
            while True:
                relayed = yield coro.receive()
                client.send(relayed)
                status_coro.send(relayed)

        computation.status_coro = Coro(chain_status_msgs, self.status_coro,
                                       computation.status_coro)

    self._rcoros = {}
    self._rcoros_done = asyncoro.Event()
    self._askew_results = {}
    self._servers = {}
    self._server_avail = asyncoro.Event()
    self._remote_scheduler = False
    self.asyncoro = asyncoro.AsynCoro()
    Coro(computation.schedule)
class Node_Service(Service): exit = False pause_scheduler = False def __init__(self, message_router): super(Node_Service, self).__init__(SERVICE_NODE, message_router, cl_name="ns") self.exit = False self.pause_scheduler = False self.nodes = {} self.queue = deque() self.delay_queue = deque() self.scheduler_thread = Thread(target=self._thread_scheduler) self.scheduler_thread.daemon = True self.scheduler_thread.start() self._stabilize_coro = Coro(self._coro_stabilize) def delay_enqueue(self,message,ms): if not self.pause_scheduler: self.delay_queue.append( (time.time(), ms, message)) def _thread_scheduler(self): try: while not self.exit: requeue = deque() while len(self.delay_queue) > 0 and not self.exit: queued_at, delay_ms, message = self.delay_queue.popleft() if (queued_at + delay_ms / 1000) < time.time(): #self.enqueue(message) self._stabilize_coro.send(message) else: requeue.append((queued_at, delay_ms, message)) while len(requeue) > 0 and not self.exit: self.delay_queue.append(requeue.popleft()) del requeue time.sleep(.1) # 10 ms except: show_error() print "Scheduler thread exiting" def _coro_stabilize(self, coro=None): coro.set_daemon() thread_pool = AsynCoroThreadPool(2 * multiprocessing.cpu_count()) while not self.exit: try: command, context = yield self._stabilize_coro.receive() if self.exit: # fast exit for now (non-graceful) break #command, context = self.queue.popleft() if self.pause_scheduler or context.join_on_stabilize: # just cycle the messages until unpaused if context.join_on_stabilize: context.send_message(Find_Successor_Message(context.thisNode, context.thisNode.key, context.thisNode), context.join_on_stabilize) context.join_on_stabilize = None delay = MAINTENANCE_PERIOD * 3 else: delay = MAINTENANCE_PERIOD self.delay_enqueue((command,context), delay) continue if command == "NODE_STABILIZE": yield thread_pool.async_task(coro, context.begin_stabilize) self.delay_enqueue( ("NODE_CHECK_PREDECESSOR", context), MAINTENANCE_PERIOD) elif command == 
"NODE_CHECK_PREDECESSOR": yield thread_pool.async_task(coro, context.begin_stabilize) yield thread_pool.async_task(coro, context.check_predecessor) self.delay_enqueue( ("NODE_FIX_FINGERS", context), MAINTENANCE_PERIOD ) elif command == "NODE_FIX_FINGERS": yield thread_pool.async_task(coro, context.fix_fingers, 10) self.delay_enqueue( ("NODE_STABILIZE", context), MAINTENANCE_PERIOD) except: show_error() print "Coro(stabilize) exiting" def stop(self, context=None): self.exit = True self._stabilize_coro.send((None,None)) #for i in range(2 * multiprocessing.cpu_count()): #self.queue.append(('terminate', context)) #self.signal_item_queued.set() def get_console_node(self): if len(self.nodes) > 0: return self.nodes[self.nodes.keys()[0]] def check_scheduler_pause(self,msg): if not self.pause_scheduler: return False if msg.type == Message_Recv_Peer_Data.Type(): msg = msg.network_msg result = False if msg.type == Stabilize_Reply_Message.Type(): result = True elif msg.type == Stablize_Message.Type(): result = True elif msg.type == Check_Predecessor_Message.Type(): result = True elif msg.type == Update_Message.Type(): result = True elif msg.type == Find_Successor_Message.Type(): result = True return result def handle_message(self, msg): if not msg.dest_service == self.service_id: raise Exception("Mismatched service recipient for message.") if msg.type == Message_Setup_Node.Type(): node = Node(self, msg.public_ip, msg.local_ip, msg.local_port) if msg.seeded_peers: for peer in msg.seeded_peers: node.join(peer) # use callback to try next peer if join fails (join is async) else: node.join() self.message_router.route( Message_Start_Server(node.local_ip, node.local_port, Message_Start_Server_Callback(self.service_id, node, True), Message_Start_Server_Callback(self.service_id, node, False))) elif msg.type == Message_Start_Server_Callback.Type(): if msg.result: self.nodes[str(msg.node)] = msg.node self.delay_enqueue(("NODE_STABILIZE", msg.node), 1000) else: msg.node.exit_network() 
raise Exception( "Unable to successfully start server for node at " + str(msg.node.thisNode)) if msg.type == Message_Forward.Type( ): ni = msg.origin_node if self.nodes.has_key(str(ni)): lnode = self.nodes[str(ni)] # get the Node class this message is addressed to (ip:port) forward_node = lnode.find_ideal_forward(msg.forward_hash) if msg.forward_msg.type == Database_Get_Message.Type() or msg.forward_msg.type == Database_Put_Message.Type(): msg.forward_msg.storage_node = forward_node if forward_node != ni: self.send_message(msg.forward_msg, forward_node) else: self.send_message(msg.forward_msg) elif msg.type == Message_Recv_Peer_Data.Type(): # came off the wire ni = Node_Info(msg.local_ip if len(msg.local_ip) > 0 else "127.0.0.1",msg.local_port) if self.nodes.has_key(str(ni)): lnode = self.nodes[str(ni)] # get the Node class this message is addressed to (ip:port) msg = msg.network_msg rnode = lnode.final_destination(msg) if rnode != lnode.thisNode: self.send_message(msg,rnode) return logger.debug(str(ni) + " received network msg: " + msg.type) if msg.type == Message_Forward.Type( ): lnode.find_ideal_forward(msg.forward_hash) #fix this...we need to do forward the message elif msg.type == Find_Successor_Message.Type(): self.send_message(Update_Message(lnode.thisNode, msg.reply_to.key, msg.finger), msg.reply_to) elif msg.type == Update_Message.Type( ): lnode.update_finger(msg.reply_to, msg.finger) elif msg.type == Check_Predecessor_Message.Type(): self.send_message(Update_Message(lnode.thisNode, msg.reply_to.key, 0), msg.reply_to) elif msg.type == Stablize_Message.Type( ): self.send_message(Stabilize_Reply_Message(lnode.thisNode, msg.reply_to.key, msg.reply_to)) elif msg.type == Stabilize_Reply_Message.Type(): lnode.stabilize(msg) elif msg.type == Notify_Message.Type(): lnode.get_notified(msg) elif msg.type == Exit_Message.Type(): lnode.peer_polite_exit(msg.reply_to) elif msg.type == Database_Put_Message.Type() or msg.type == Database_Get_Message.Type(): 
msg.storage_node = lnode.thisNode self.message_router.route(msg) elif msg.type == Database_Put_Message_Response.Type() or msg.type == Database_Get_Message_Response.Type(): self.message_router.route(msg) elif msg.type == Message_Console_Command.Type(): self.handle_command(msg.command,msg.args) # wraps message in network packet if not destined for local #def route_node_message(self,msg): # if self.nodes.has_key(str(msg.origin_node)): # node = self.nodes[str(msg.origin_node)] # forward_to = node.find_ideal_forward(msg.destination_key) # if forward_to == node: # self.send_message(msg) # else: # self.send_message(Message_Send_Peer_Data(forward_to, msg.serialize())) def handle_command(self,cmd,args=[]): if cmd == "print": for node in self.nodes.values(): print "successor ", node.thisNode.successor.print_key() print "predecessor", node.thisNode.predecessor.print_key() elif cmd == "pause": self.pause_scheduler = True elif cmd == "resume": self.pause_scheduler = False
class _DispyNode(object):
    """Internal use only.

    A compute node: announces itself to schedulers over UDP, accepts
    computations, files and jobs over TCP connections (see
    'tcp_serve_task'), runs jobs as processes / programs and sends the
    results back to the scheduler.
    """

    def __init__(self, cpus, ip_addr=None, ext_ip_addr=None, node_port=None,
                 scheduler_node=None, scheduler_port=None,
                 dest_path_prefix='', secret='', keyfile=None, certfile=None,
                 max_file_size=None, zombie_interval=60):
        """Bind the TCP and UDP sockets and start the reply thread, the
        timer coroutine and the UDP server coroutine.

        'cpus' must be between 1 and the number of cpus on this
        machine. 'node_port' defaults to 51348 and 'scheduler_port' to
        51347. 'dest_path_prefix' defaults to 'dispy' under the 'tmp'
        directory at the filesystem root. 'zombie_interval' is
        multiplied by 60 before use (presumably it is given in minutes
        - TODO confirm).

        Raises Exception if 'ip_addr' or 'ext_ip_addr' is given but is
        not valid according to '_node_ipaddr'.
        """
        assert 0 < cpus <= multiprocessing.cpu_count()
        self.cpus = cpus
        if ip_addr:
            ip_addr = _node_ipaddr(ip_addr)
            if not ip_addr:
                raise Exception('invalid ip_addr')
        else:
            self.name = socket.gethostname()
            ip_addr = socket.gethostbyname(self.name)
        if ext_ip_addr:
            ext_ip_addr = _node_ipaddr(ext_ip_addr)
            if not ext_ip_addr:
                raise Exception('invalid ext_ip_addr')
        else:
            ext_ip_addr = ip_addr
        try:
            self.name = socket.gethostbyaddr(ext_ip_addr)[0]
        except:
            self.name = socket.gethostname()
        if not node_port:
            node_port = 51348
        if not scheduler_port:
            scheduler_port = 51347

        self.ip_addr = ip_addr
        self.ext_ip_addr = ext_ip_addr
        self.scheduler_port = scheduler_port
        self.pulse_interval = None
        self.keyfile = keyfile
        self.certfile = certfile
        if self.keyfile:
            self.keyfile = os.path.abspath(self.keyfile)
        if self.certfile:
            self.certfile = os.path.abspath(self.certfile)

        self.asyncoro = AsynCoro()

        # NOTE(review): the socket is put in listening state here, but
        # the coroutine that would accept connections is commented out
        # below; connections are presumably accepted elsewhere - confirm
        self.tcp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        if self.certfile:
            self.tcp_sock = ssl.wrap_socket(self.tcp_sock, keyfile=self.keyfile,
                                            certfile=self.certfile)
        self.tcp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.tcp_sock.bind((self.ip_addr, node_port))
        self.address = self.tcp_sock.getsockname()
        self.tcp_sock.listen(30)

        if dest_path_prefix:
            self.dest_path_prefix = dest_path_prefix.strip().rstrip(os.sep)
        else:
            self.dest_path_prefix = os.path.join(os.sep, 'tmp', 'dispy')
        if not os.path.isdir(self.dest_path_prefix):
            os.makedirs(self.dest_path_prefix)
            os.chmod(self.dest_path_prefix, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        if max_file_size is None:
            max_file_size = MaxFileSize
        self.max_file_size = max_file_size

        self.avail_cpus = self.cpus
        self.computations = {}
        self.scheduler_ip_addr = None
        self.file_uses = {}
        self.job_infos = {}
        self.lock = asyncoro.Lock()
        self.terminate = False
        self.signature = os.urandom(20).encode('hex')
        # clients must send this code first on every TCP connection
        self.auth_code = hashlib.sha1(self.signature + secret).hexdigest()
        self.zombie_interval = 60 * zombie_interval

        logger.debug('auth_code for %s: %s', ip_addr, self.auth_code)

        self.udp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self.udp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.udp_sock.bind(('', node_port))
        logger.info('serving %s cpus at %s:%s', self.cpus, self.ip_addr, node_port)
        logger.debug('tcp server at %s:%s', self.address[0], self.address[1])
        self.udp_sock = AsynCoroSocket(self.udp_sock, blocking=False)

        scheduler_ip_addr = _node_ipaddr(scheduler_node)

        self.reply_Q = multiprocessing.Queue()
        self.reply_Q_thread = threading.Thread(target=self.__reply_Q)
        self.reply_Q_thread.start()

        self.timer_coro = Coro(self.timer_task)
        # self.tcp_coro = Coro(self.tcp_server)
        self.udp_coro = Coro(self.udp_server, scheduler_ip_addr)

    def send_pong_msg(self, coro=None):
        """Coroutine: broadcast a 'PONG:' message describing this node to
        the scheduler port.
        """
        ping_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        ping_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        ping_sock = AsynCoroSocket(ping_sock, blocking=False)
        pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1],
                    'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version}
        pong_msg = 'PONG:' + serialize(pong_msg)
        yield ping_sock.sendto(pong_msg, ('<broadcast>', self.scheduler_port))
        ping_sock.close()

    def udp_server(self, scheduler_ip_addr, coro=None):
        """Daemon coroutine serving UDP requests: replies to 'PING:',
        refreshes the pulse time of computations on 'PULSE:' and
        answers 'SERVERPORT:' with the TCP address of this node.

        If 'scheduler_ip_addr' is given, a PONG message is sent to it
        before serving requests.
        """
        assert coro is not None
        coro.set_daemon()
        if self.avail_cpus == self.cpus:
            yield self.send_pong_msg(coro=coro)
        pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1],
                    'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version}
        pong_msg = 'PONG:' + serialize(pong_msg)

        if scheduler_ip_addr:
            sock = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
            try:
                yield sock.sendto(pong_msg, (scheduler_ip_addr, self.scheduler_port))
            except:
                logger.warning("Couldn't send ping message to %s:%s",
                               scheduler_ip_addr, self.scheduler_port)
            finally:
                sock.close()

        while True:
            msg, addr = yield self.udp_sock.recvfrom(1024)
            # TODO: process each message as separate Coro, so
            # exceptions are contained?
            if msg.startswith('PING:'):
                if self.cpus != self.avail_cpus:
                    logger.debug('Busy (%s/%s); ignoring ping message from %s',
                                 self.cpus, self.avail_cpus, addr[0])
                    continue
                try:
                    info = unserialize(msg[len('PING:'):])
                    socket.inet_aton(info['scheduler_ip_addr'])
                    assert isinstance(info['scheduler_port'], int)
                    assert info['version'] == _dispy_version
                    # reply to the address given in the message, not
                    # the one the datagram came from
                    addr = (info['scheduler_ip_addr'], info['scheduler_port'])
                except:
                    # raise
                    logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
                    continue
                yield self.udp_sock.sendto(pong_msg, addr)
            elif msg.startswith('PULSE:'):
                try:
                    info = unserialize(msg[len('PULSE:'):])
                    assert info['ip_addr'] == self.scheduler_ip_addr
                    yield self.lock.acquire()
                    for compute in self.computations.itervalues():
                        compute.last_pulse = time.time()
                    yield self.lock.release()
                except:
                    logger.warning('Ignoring PULSE from %s', addr[0])
            elif msg.startswith('SERVERPORT:'):
                try:
                    req = unserialize(msg[len('SERVERPORT:'):])
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    reply = {'ip_addr':self.address[0], 'port':self.address[1],
                             'sign':self.signature, 'version':_dispy_version}
                    sock = AsynCoroSocket(sock, blocking=False)
                    sock.settimeout(1)
                    yield sock.sendto(serialize(reply), (req['ip_addr'], req['port']))
                    sock.close()
                except:
                    logger.debug(traceback.format_exc())
                    # pass
            else:
                logger.warning('Ignoring ping message from %s', addr[0])

    def tcp_serve_task(self, conn, addr, coro=None):
        """Coroutine: serve one request on TCP connection 'conn' from
        'addr'.

        The client must first send the node's auth code; the message
        that follows is dispatched on its prefix ('JOB:', 'COMPUTE:',
        'FILEXFER:', 'DEL_COMPUTE:', 'TERMINATE_JOB:' or
        'RETRIEVE_JOB:'). The connection is closed afterwards.
        """
        conn = AsynCoroSocket(conn, blocking=False,
                              keyfile=self.keyfile, certfile=self.certfile)

        def job_request_task(msg):
            # Accept (ACK) or reject (NAK) a new job; an accepted job is
            # started as a process (function computation) or as a
            # thread running a program (program computation).
            assert coro is not None
            try:
                _job = unserialize(msg)
            except:
                logger.debug('Ignoring job request from %s', addr[0])
                logger.debug(traceback.format_exc())
                raise StopIteration
            yield self.lock.acquire()
            compute = self.computations.get(_job.compute_id, None)
            if compute is not None:
                if compute.scheduler_ip_addr != self.scheduler_ip_addr:
                    compute = None
            yield self.lock.release()
            if self.avail_cpus == 0:
                logger.warning('All cpus busy')
                try:
                    yield conn.send_msg('NAK (all cpus busy)')
                except:
                    pass
                raise StopIteration
            elif compute is None:
                logger.warning('Invalid computation %s', _job.compute_id)
                try:
                    yield conn.send_msg('NAK (invalid computation %s)' % _job.compute_id)
                except:
                    pass
                raise StopIteration

            reply_addr = (compute.scheduler_ip_addr, compute.job_result_port)
            logger.debug('New job id %s from %s', _job.uid, addr[0])
            # save files sent along with the job in computation's directory
            files = []
            for f in _job.files:
                tgt = os.path.join(compute.dest_path, os.path.basename(f['name']))
                try:
                    with open(tgt, 'wb') as fd:
                        fd.write(f['data'])
                except:
                    logger.warning('Could not save file "%s"', tgt)
                    continue
                try:
                    os.utime(tgt, (f['stat'].st_atime, f['stat'].st_mtime))
                    os.chmod(tgt, stat.S_IMODE(f['stat'].st_mode))
                except:
                    logger.debug('Could not set modes for "%s"', tgt)
                files.append(tgt)
            _job.files = files

            if compute.type == _Compute.func_type:
                reply = _JobReply(_job, self.ext_ip_addr)
                job_info = _DispyJobInfo(reply, reply_addr, compute)
                args = (job_info, self.certfile, self.keyfile,
                        _job.args, _job.kwargs, self.reply_Q,
                        compute.name, compute.code, compute.dest_path, _job.files)
                try:
                    yield conn.send_msg('ACK')
                except:
                    logger.warning('Failed to send response for new job to %s', str(addr))
                    raise StopIteration
                job_info.job_reply.status = DispyJob.Running
                job_info.proc = multiprocessing.Process(target=_dispy_job_func, args=args)
                yield self.lock.acquire()
                self.avail_cpus -= 1
                compute.pending_jobs += 1
                self.job_infos[_job.uid] = job_info
                self.lock.release()
                job_info.proc.start()
                raise StopIteration
            elif compute.type == _Compute.prog_type:
                try:
                    yield conn.send_msg('ACK')
                except:
                    logger.warning('Failed to send response for new job to %s', str(addr))
                    raise StopIteration
                reply = _JobReply(_job, self.ext_ip_addr)
                job_info = _DispyJobInfo(reply, reply_addr, compute)
                job_info.job_reply.status = DispyJob.Running
                yield self.lock.acquire()
                self.job_infos[_job.uid] = job_info
                self.avail_cpus -= 1
                compute.pending_jobs += 1
                yield self.lock.release()
                prog_thread = threading.Thread(target=self.__job_program,
                                               args=(_job, job_info))
                prog_thread.start()
                raise StopIteration
            else:
                try:
                    yield conn.send_msg('NAK (invalid computation type "%s")' % compute.type)
                except:
                    logger.warning('Failed to send response for new job to %s', str(addr))

        def add_computation_task(msg):
            # Register a new computation: create its destination
            # directory, compile its code (function computations) and
            # tell the client which files still need to be transferred.
            assert coro is not None
            try:
                compute = unserialize(msg)
            except:
                logger.debug('Ignoring computation request from %s', addr[0])
                try:
                    yield conn.send_msg('Invalid computation request')
                except:
                    logger.warning('Failed to send reply to %s', str(addr))
                raise StopIteration
            yield self.lock.acquire()
            # the node serves one scheduler at a time
            if not ((self.scheduler_ip_addr is None) or
                    (self.scheduler_ip_addr == compute.scheduler_ip_addr and
                     self.scheduler_port == compute.scheduler_port)):
                logger.debug('Ignoring computation request from %s: %s, %s, %s',
                             compute.scheduler_ip_addr, self.scheduler_ip_addr,
                             self.avail_cpus, self.cpus)
                self.lock.release()
                try:
                    yield conn.send_msg('Busy')
                except:
                    pass
                raise StopIteration

            resp = 'ACK'
            if compute.dest_path and isinstance(compute.dest_path, str):
                compute.dest_path = compute.dest_path.strip(os.sep)
            else:
                # pick a random directory name that is not in use yet
                for x in xrange(20):
                    compute.dest_path = os.urandom(8).encode('hex')
                    if compute.dest_path.find(os.sep) >= 0:
                        continue
                    if not os.path.isdir(os.path.join(self.dest_path_prefix,
                                                      compute.dest_path)):
                        break
                else:
                    logger.warning('Failed to create unique dest_path: %s',
                                   compute.dest_path)
                    resp = 'NACK'
            compute.dest_path = os.path.join(self.dest_path_prefix, compute.dest_path)
            try:
                os.makedirs(compute.dest_path)
                os.chmod(compute.dest_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
                logger.debug('dest_path for "%s": %s', compute.name, compute.dest_path)
            except:
                logger.warning('Invalid destination path: "%s"', compute.dest_path)
                if os.path.isdir(compute.dest_path):
                    os.rmdir(compute.dest_path)
                self.lock.release()
                try:
                    yield conn.send_msg('NACK (Invalid dest_path)')
                except:
                    logger.warning('Failed to send reply to %s', str(addr))
                raise StopIteration
            if compute.id in self.computations:
                logger.warning('Computation "%s" (%s) is being replaced',
                               compute.name, compute.id)
            setattr(compute, 'last_pulse', time.time())
            setattr(compute, 'pending_jobs', 0)
            setattr(compute, 'pending_results', 0)
            setattr(compute, 'zombie', False)
            logger.debug('xfer_files given: %s',
                         ','.join(xf.name for xf in compute.xfer_files))
            if compute.type == _Compute.func_type:
                try:
                    code = compile(compute.code, '<string>', 'exec')
                except:
                    logger.warning('Computation "%s" could not be compiled', compute.name)
                    if os.path.isdir(compute.dest_path):
                        os.rmdir(compute.dest_path)
                    self.lock.release()
                    try:
                        yield conn.send_msg('NACK (Compilation failed)')
                    except:
                        logger.warning('Failed to send reply to %s', str(addr))
                    raise StopIteration
                compute.code = marshal.dumps(code)
            elif compute.type == _Compute.prog_type:
                assert not compute.code
                compute.name = os.path.join(compute.dest_path,
                                            os.path.basename(compute.name))

            # files already present at the node (and unchanged) need not
            # be transferred; only their use count is incremented
            xfer_files = []
            for xf in compute.xfer_files:
                tgt = os.path.join(compute.dest_path, os.path.basename(xf.name))
                try:
                    if _same_file(tgt, xf):
                        logger.debug('Ignoring file "%s" / "%s"', xf.name, tgt)
                        if tgt not in self.file_uses:
                            self.file_uses[tgt] = 0
                        self.file_uses[tgt] += 1
                        continue
                except:
                    pass
                if self.max_file_size and xf.stat_buf.st_size > self.max_file_size:
                    resp = 'NACK (file "%s" too big)' % xf.name
                else:
                    xfer_files.append(xf)

            if resp == 'ACK' and ((self.scheduler_ip_addr is not None) and
                                  (self.scheduler_ip_addr != compute.scheduler_ip_addr)):
                resp = 'NACK (busy)'

            if resp == 'ACK':
                self.computations[compute.id] = compute
                self.scheduler_ip_addr = compute.scheduler_ip_addr
                self.scheduler_port = compute.scheduler_port
                self.pulse_interval = compute.pulse_interval
                self.lock.release()
                if xfer_files:
                    resp += ':XFER_FILES:' + serialize(xfer_files)
                try:
                    yield conn.send_msg(resp)
                except:
                    # client did not get the reply; undo registration
                    assert self.scheduler_ip_addr == compute.scheduler_ip_addr
                    yield self.lock.acquire()
                    del self.computations[compute.id]
                    self.scheduler_ip_addr = None
                    self.scheduler_port = None
                    self.pulse_interval = None
                    self.lock.release()
                else:
                    self.timer_coro.resume(True)
            else:
                self.lock.release()
                if os.path.isdir(compute.dest_path):
                    os.rmdir(compute.dest_path)
                try:
                    yield conn.send_msg(resp)
                except:
                    pass

        def xfer_file_task(msg):
            # Receive contents of a file for a computation, unless the
            # same file is already present at the node.
            assert coro is not None
            try:
                xf = unserialize(msg)
            except:
                logger.debug('Ignoring file trasnfer request from %s', addr[0])
                raise StopIteration
            resp = ''
            if xf.compute_id not in self.computations:
                logger.error('computation "%s" is invalid' % xf.compute_id)
                raise StopIteration
            tgt = os.path.join(self.computations[xf.compute_id].dest_path,
                               os.path.basename(xf.name))
            if os.path.isfile(tgt):
                if _same_file(tgt, xf):
                    yield self.lock.acquire()
                    if tgt in self.file_uses:
                        self.file_uses[tgt] += 1
                    else:
                        self.file_uses[tgt] = 1
                    yield self.lock.release()
                    resp = 'ACK'
                else:
                    logger.warning('File "%s" already exists with different status as "%s"',
                                   xf.name, tgt)
            if not resp:
                logger.debug('Copying file %s to %s (%s)', xf.name, tgt, xf.stat_buf.st_size)
                try:
                    # 'with' closes the file even if receiving fails midway
                    with open(tgt, 'wb') as fd:
                        n = 0
                        while n < xf.stat_buf.st_size:
                            data = yield conn.recvall(min(xf.stat_buf.st_size-n, 10240000))
                            if not data:
                                break
                            fd.write(data)
                            n += len(data)
                            if self.max_file_size and n > self.max_file_size:
                                logger.warning('File "%s" is too big (%s); it is truncated',
                                               tgt, n)
                                break
                    if n < xf.stat_buf.st_size:
                        resp = 'NAK (read only %s bytes)' % n
                    else:
                        resp = 'ACK'
                        logger.debug('Copied file %s, %s', tgt, resp)
                        os.utime(tgt, (xf.stat_buf.st_atime, xf.stat_buf.st_mtime))
                        os.chmod(tgt, stat.S_IMODE(xf.stat_buf.st_mode))
                        self.file_uses[tgt] = 1
                except:
                    logger.warning('Copying file "%s" failed with "%s"',
                                   xf.name, traceback.format_exc())
                    resp = 'NACK'
            try:
                yield conn.send_msg(resp)
            except:
                logger.debug('Could not send reply for "%s"', xf.name)
            raise StopIteration  # xfer_file_task

        def terminate_job_task(msg):
            # Terminate a running job on request of its scheduler and
            # send a reply with status DispyJob.Terminated.
            assert coro is not None
            yield self.lock.acquire()
            try:
                _job = unserialize(msg)
                compute = self.computations[_job.compute_id]
                assert addr[0] == compute.scheduler_ip_addr
                job_info = self.job_infos.pop(_job.uid, None)
            except:
                logger.debug('Ignoring job request from %s', addr[0])
                raise StopIteration
            finally:
                self.lock.release()
            if job_info is None:
                logger.debug('Job %s completed; ignoring cancel request from %s',
                             _job.uid, addr[0])
                raise StopIteration
            logger.debug('Terminating job %s', _job.uid)
            job_info.proc.terminate()
            # wait up to about 2 seconds for the job to go away
            if isinstance(job_info.proc, multiprocessing.Process):
                for x in xrange(20):
                    if job_info.proc.is_alive():
                        yield coro.sleep(0.1)
                    else:
                        logger.debug('Process "%s" for job %s terminated',
                                     compute.name, _job.uid)
                        break
                else:
                    logger.warning('Could not kill process %s', compute.name)
                    raise StopIteration
            else:
                assert isinstance(job_info.proc, subprocess.Popen)
                for x in xrange(20):
                    rc = job_info.proc.poll()
                    logger.debug('Program "%s" for job %s terminated with %s',
                                 compute.name, _job.uid, rc)
                    if rc is not None:
                        break
                    if x == 10:
                        # terminate did not work so far; be more forceful
                        logger.debug('Killing job %s', _job.uid)
                        job_info.proc.kill()
                    yield coro.sleep(0.1)
                else:
                    logger.warning('Could not kill process %s', compute.name)
                    raise StopIteration
            reply_addr = (addr[0], compute.job_result_port)
            reply = _JobReply(_job, self.ext_ip_addr)
            job_info = _DispyJobInfo(reply, reply_addr, compute)
            reply.status = DispyJob.Terminated
            yield self._send_job_reply(job_info, resending=False, coro=coro)

        def retrieve_job_task(msg):
            # Send the reply of a job to the client: either the current
            # reply of a job known to the node, or a reply that was
            # saved in a file because it could not be delivered.
            assert coro is not None
            try:
                req = unserialize(msg)
                assert req['uid'] is not None
                assert req['hash'] is not None
                assert req['compute_id'] is not None
            except:
                resp = serialize('Invalid job')
                try:
                    yield conn.send_msg(resp)
                except:
                    pass
                raise StopIteration

            job_info = self.job_infos.get(req['uid'], None)
            resp = None
            if job_info is not None:
                try:
                    yield conn.send_msg(serialize(job_info.job_reply))
                    ack = yield conn.recv_msg()
                    # no need to check ack
                except:
                    logger.debug('Could not send reply for job %s', req['uid'])
                raise StopIteration

            for d in os.listdir(self.dest_path_prefix):
                info_file = os.path.join(self.dest_path_prefix, d,
                                         '_dispy_job_reply_%s' % req['uid'])
                if os.path.isfile(info_file):
                    try:
                        # NOTE: pickle is only safe here as long as the
                        # reply files are written by this node alone
                        with open(info_file, 'rb') as fd:
                            job_reply = pickle.load(fd)
                    except:
                        job_reply = None

                    if hasattr(job_reply, 'hash') and job_reply.hash == req['hash']:
                        try:
                            yield conn.send_msg(serialize(job_reply))
                            ack = yield conn.recv_msg()
                            assert ack == 'ACK'
                        except:
                            logger.debug('Could not send reply for job %s', req['uid'])
                            raise StopIteration
                        try:
                            os.remove(info_file)
                            yield self.lock.acquire()
                            compute = self.computations.get(req['compute_id'], None)
                            if compute is not None:
                                compute.pending_results -= 1
                                if compute.pending_results == 0:
                                    compute.zombie = True
                                    self.cleanup_computation(compute)
                            self.lock.release()
                        except:
                            logger.debug('Could not remove "%s"', info_file)
                        raise StopIteration
            else:
                # no saved reply matched the request
                resp = serialize('Invalid job: %s' % req['uid'])

            if resp:
                try:
                    yield conn.send_msg(resp)
                except:
                    pass

        # tcp_serve_task starts
        try:
            req = yield conn.recvall(len(self.auth_code))
            assert req == self.auth_code
        except:
            logger.warning('Ignoring request; invalid client authentication?')
            conn.close()
            raise StopIteration
        msg = yield conn.recv_msg()
        if not msg:
            conn.close()
            raise StopIteration
        if msg.startswith('JOB:'):
            msg = msg[len('JOB:'):]
            yield job_request_task(msg)
            conn.close()
        elif msg.startswith('COMPUTE:'):
            msg = msg[len('COMPUTE:'):]
            yield add_computation_task(msg)
            conn.close()
        elif msg.startswith('FILEXFER:'):
            msg = msg[len('FILEXFER:'):]
            yield xfer_file_task(msg)
            conn.close()
        elif msg.startswith('DEL_COMPUTE:'):
            msg = msg[len('DEL_COMPUTE:'):]
            try:
                info = unserialize(msg)
                compute_id = info['ID']
                yield self.lock.acquire()
                compute = self.computations.get(compute_id, None)
                if compute is None:
                    logger.warning('Computation "%s" is not valid', compute_id)
                else:
                    compute.zombie = True
                    self.cleanup_computation(compute)
                self.lock.release()
            except:
                logger.debug('Deleting computation failed with %s', traceback.format_exc())
                # raise
            conn.close()
        elif msg.startswith('TERMINATE_JOB:'):
            msg = msg[len('TERMINATE_JOB:'):]
            yield terminate_job_task(msg)
            conn.close()
        elif msg.startswith('RETRIEVE_JOB:'):
            msg = msg[len('RETRIEVE_JOB:'):]
            yield retrieve_job_task(msg)
            conn.close()
        else:
            logger.warning('Invalid request "%s" from %s',
                           msg[:min(10, len(msg))], addr[0])
            resp = 'NAK (invalid command: %s)' % (msg[:min(10, len(msg))])
            try:
                yield conn.send_msg(resp)
            except:
                logger.warning('Failed to send reply to %s', str(addr))
            conn.close()

    def timer_task(self, coro=None):
        """Daemon coroutine that periodically sends 'PULSE:' messages to
        the scheduler while jobs are running, deletes computations that
        have not received a pulse for 'zombie_interval' seconds and
        resends job results that were saved in files.

        Resuming the coroutine with True makes it recompute its timeout
        from the current pulse and zombie intervals.
        """
        coro.set_daemon()
        reset = True
        last_pulse_time = last_zombie_time = time.time()
        while True:
            if reset:
                if self.pulse_interval and self.zombie_interval:
                    timeout = min(self.pulse_interval, self.zombie_interval)
                    self.zombie_interval = max(5 * self.pulse_interval, self.zombie_interval)
                else:
                    timeout = max(self.pulse_interval, self.zombie_interval)

            reset = yield coro.suspend(timeout)

            now = time.time()
            if self.pulse_interval and (now - last_pulse_time) >= self.pulse_interval:
                n = self.cpus - self.avail_cpus
                assert n >= 0
                if n > 0 and self.scheduler_ip_addr:
                    last_pulse_time = now
                    msg = 'PULSE:' + serialize({'ip_addr':self.ext_ip_addr,
                                                'port':self.udp_sock.getsockname()[1],
                                                'cpus':n})
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    sock = AsynCoroSocket(sock, blocking=False)
                    sock.settimeout(1)
                    yield sock.sendto(msg, (self.scheduler_ip_addr, self.scheduler_port))
                    sock.close()
            if self.zombie_interval and (now - last_zombie_time) >= self.zombie_interval:
                last_zombie_time = now
                yield self.lock.acquire()
                for compute in self.computations.itervalues():
                    if (now - compute.last_pulse) > self.zombie_interval:
                        compute.zombie = True
                zombies = [compute for compute in self.computations.itervalues()
                           if compute.zombie and compute.pending_jobs == 0]
                for compute in zombies:
                    logger.debug('Deleting zombie computation "%s"', compute.name)
                    self.cleanup_computation(compute)
                # computations that are alive and have saved results
                phoenix = [compute for compute in self.computations.itervalues()
                           if not compute.zombie and compute.pending_results]
                for compute in phoenix:
                    files = [f for f in os.listdir(compute.dest_path)
                             if f.startswith('_dispy_job_reply_')]
                    # limit number queued so as not to take up too much time
                    files = files[:min(len(files), 128)]
                    for f in files:
                        result_file = os.path.join(compute.dest_path, f)
                        try:
                            with open(result_file, 'rb') as fd:
                                job_result = pickle.load(fd)
                        except:
                            logger.debug('Could not load "%s"', result_file)
                            logger.debug(traceback.format_exc())
                            continue
                        try:
                            os.remove(result_file)
                        except:
                            logger.debug('Could not remove "%s"', result_file)
                        compute.pending_results -= 1
                        job_info = _DispyJobInfo(job_result, (compute.scheduler_ip_addr,
                                                              compute.job_result_port), compute)
                        Coro(self._send_job_reply, job_info, resending=True)
                self.lock.release()
                for compute in zombies:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    sock = AsynCoroSocket(sock, blocking=False)
                    sock.settimeout(1)
                    logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr)
                    data = serialize({'ip_addr':self.address[0], 'port':self.address[1],
                                      'sign':self.signature})
                    yield sock.sendto('TERMINATED:%s' % data, (compute.scheduler_ip_addr,
                                                               compute.scheduler_port))
                    sock.close()
                if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus:
                    # node is free again; announce it
                    self.pulse_interval = None
                    reset = True
                    yield self.send_pong_msg(coro=coro)

    def __job_program(self, _job, job_info):
        """Thread body: run the program of a program computation with
        the job's arguments and queue the reply (output, exit status or
        exception) in 'reply_Q'.
        """
        compute = self.computations[_job.compute_id]
        program = [compute.name]
        args = unserialize(_job.args)
        program.extend(args)
        logger.debug('Executing "%s"', str(program))
        reply = job_info.job_reply
        try:
            os.chdir(compute.dest_path)
            env = {}
            env.update(os.environ)
            # programs transferred with the computation are found first
            env['PATH'] = compute.dest_path + os.pathsep + env['PATH']
            job_info.proc = subprocess.Popen(program, stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE, env=env)
            assert isinstance(job_info.proc, subprocess.Popen)
            reply.stdout, reply.stderr = job_info.proc.communicate()
            reply.result = job_info.proc.returncode
            reply.status = DispyJob.Finished
        except:
            logger.debug('Executing %s failed with %s', str(program), str(sys.exc_info()))
            reply.exception = traceback.format_exc()
            reply.status = DispyJob.Terminated
        self.reply_Q.put(reply)

    def __reply_Q(self):
        """Thread body: take job replies from 'reply_Q', reap the job's
        process and send each reply to the scheduler. A None entry ends
        the thread.
        """
        while True:
            job_reply = self.reply_Q.get()
            if job_reply is None:
                break
            job_info = self.job_infos.pop(job_reply.uid, None)
            if job_info is not None:
                if job_info.proc is not None:
                    if isinstance(job_info.proc, multiprocessing.Process):
                        job_info.proc.join(2)
                    else:
                        job_info.proc.wait()
                job_info.job_reply = job_reply
                Coro(self._send_job_reply, job_info, resending=False).value()

    def _send_job_reply(self, job_info, resending=False, coro=None):
        """Internal use only.

        Coroutine: send the reply of a job to the job's reply address.
        If that fails, the reply is saved in a file in the
        computation's directory. Unless 'resending' is True, the cpu
        used by the job is made available again afterwards.
        """
        assert coro is not None
        job_reply = job_info.job_reply
        logger.debug('Sending result for job %s (%s) to %s',
                     job_reply.uid, job_reply.status, str(job_info.reply_addr))
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock = AsynCoroSocket(sock, blocking=False,
                              certfile=self.certfile, keyfile=self.keyfile)
        sock.settimeout(2)
        try:
            yield sock.connect(job_info.reply_addr)
            yield sock.send_msg(serialize(job_reply))
            ack = yield sock.recv_msg()
            assert ack == 'ACK'
        except:
            logger.error("Couldn't send results for %s to %s",
                         job_reply.uid, str(job_info.reply_addr))
            # store job result even if computation has not enabled
            # fault recovery; user may be able to access node and
            # retrieve result manually
            f = os.path.join(job_info.compute_dest_path,
                             '_dispy_job_reply_%s' % job_reply.uid)
            logger.debug('storing results for job %s', job_reply.uid)
            try:
                with open(f, 'wb') as fd:
                    pickle.dump(job_reply, fd)
            except:
                logger.debug('Could not save results for job %s', job_reply.uid)
            else:
                yield self.lock.acquire()
                compute = self.computations.get(job_info.compute_id, None)
                if compute is not None:
                    compute.pending_results += 1
                self.lock.release()
        finally:
            sock.close()
            if not resending:
                yield self.lock.acquire()
                self.avail_cpus += 1
                compute = self.computations.get(job_info.compute_id, None)
                if compute is None:
                    logger.warning('Computation for %s / %s is invalid!',
                                   job_reply.uid, job_info.compute_id)
                else:
                    # technically last_pulse should be updated only
                    # when successfully sent reply, but no harm if done
                    # otherwise, too
                    compute.last_pulse = time.time()
                    compute.pending_jobs -= 1
                    if compute.pending_jobs == 0 and compute.zombie:
                        self.cleanup_computation(compute)
                self.lock.release()

    def cleanup_computation(self, compute):
        """Forget zombie computation 'compute' and, unless its 'cleanup'
        attribute is False, remove its transferred files that are no
        longer in use and its directory if that is empty.

        Nothing is done if the computation is not a zombie or still has
        pending jobs. Must be called with lock held.
        """
        if not compute.zombie:
            return
        if compute.pending_jobs != 0:
            logger.debug('pending jobs for computation "%s"/%s: %s',
                         compute.name, compute.id, compute.pending_jobs)
            if compute.pending_jobs > 0:
                return
        del self.computations[compute.id]
        # last computation of the current scheduler: node becomes free
        if compute.scheduler_ip_addr == self.scheduler_ip_addr and \
           all(c.scheduler_ip_addr != self.scheduler_ip_addr
               for c in self.computations.itervalues()):
            assert self.avail_cpus == self.cpus
            self.scheduler_ip_addr = None
            self.pulse_interval = None
        if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus:
            self.timer_coro.resume(True)
            Coro(self.send_pong_msg)
        if compute.cleanup is False:
            return
        for xf in compute.xfer_files:
            tgt = os.path.join(compute.dest_path, os.path.basename(xf.name))
            if tgt not in self.file_uses:
                logger.debug('File "%s" is unknown', tgt)
                continue
            self.file_uses[tgt] -= 1
            if self.file_uses[tgt] == 0:
                del self.file_uses[tgt]
                # compare with the path of the file, not with the
                # transfer object itself (which never equals a string):
                # a file that was used in place must not be deleted
                if tgt == xf.name:
                    logger.debug('Not removing file "%s"', xf.name)
                else:
                    logger.debug('Removing file "%s"', tgt)
                    try:
                        os.remove(tgt)
                        if os.path.splitext(tgt)[1] == '.py' and os.path.isfile(tgt + 'c'):
                            os.remove(tgt + 'c')
                    except:
                        logger.warning('Could not remove file "%s"', tgt)

        if os.path.isdir(compute.dest_path) and \
           compute.dest_path.startswith(self.dest_path_prefix) and \
           len(compute.dest_path) > len(self.dest_path_prefix) and \
           len(os.listdir(compute.dest_path)) == 0:
            logger.debug('Removing "%s"', compute.dest_path)
            try:
                os.rmdir(compute.dest_path)
            except:
                logger.warning('Could not remove directory "%s"', compute.dest_path)

    def shutdown(self):
        """Terminate all running jobs, tell the scheduler of each
        computation that the node is terminated and stop asyncoro.
        """
        def _shutdown(self, coro=None):
            assert coro is not None
            yield self.lock.acquire()
            job_infos = self.job_infos
            self.job_infos = {}
            computations = self.computations.items()
            self.computations = {}
            if self.reply_Q:
                # ends the reply thread
                self.reply_Q.put(None)
            self.lock.release()
            for uid, job_info in job_infos.iteritems():
                job_info.proc.terminate()
                logger.debug('process for %s is killed', uid)
                if isinstance(job_info.proc, multiprocessing.Process):
                    job_info.proc.join(2)
                else:
                    job_info.proc.wait()
            for cid, compute in computations:
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                sock = AsynCoroSocket(sock, blocking=False)
                sock.settimeout(2)
                logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr)
                data = serialize({'ip_addr':self.address[0], 'port':self.address[1],
                                  'sign':self.signature})
                yield sock.sendto('TERMINATED:' + data, (compute.scheduler_ip_addr,
                                                         compute.scheduler_port))
                sock.close()

        Coro(_shutdown, self).value()
        self.asyncoro.join()
        self.asyncoro.terminate()