Esempio n. 1
0
    def communicate(self, input=None):
        """Feed 'input' to the pipe's stdin and gather its output, in the
        manner of Popen's communicate.

        Must be used with 'yield' as
        'stdout, stderr = yield async_pipe.communicate()'

        'input' must be either data (str or bytes) or an object with 'read'
        method (i.e., regular file object or AsyncFile object).

        The coroutine finishes with the pair (stdout data, stderr data); an
        entry is None when the corresponding 'self.stdout' / 'self.stderr'
        is not set.
        """
        def write_proc(fd, input, coro=None):
            # Coroutine: push all of 'input' into 'fd', then close 'fd'.
            chunk_size = 16384
            if isinstance(input, (str, bytes)):
                written = yield fd.write(input, full=True)
                if written != len(input):
                    raise IOError('write failed')
            else:
                # TODO: how to know if 'input' is file object for
                # on-disk file?
                if hasattr(input, 'seek') and hasattr(input, 'fileno'):
                    fetch = partial_func(os.read, input.fileno())
                else:
                    fetch = input.read
                chunk = yield fetch(chunk_size)
                while chunk:
                    # text chunks are encoded before being written
                    if isinstance(chunk, str):
                        chunk = chunk.encode()
                    written = yield fd.write(chunk, full=True)
                    if written != len(chunk):
                        raise IOError('write failed')
                    chunk = yield fetch(chunk_size)
                input.close()
            fd.close()

        def read_proc(fd, coro=None):
            # Coroutine: read 'fd' until an empty read, close it and finish
            # with all the pieces joined together.
            chunk_size = 16384
            pieces = []
            piece = yield fd.read(chunk_size)
            while piece:
                pieces.append(piece)
                piece = yield fd.read(chunk_size)
            fd.close()
            raise StopIteration(b''.join(pieces))

        # readers are started before the writer is waited upon
        if self.stdout:
            stdout_coro = Coro(read_proc, self.stdout)
        if self.stderr:
            stderr_coro = Coro(read_proc, self.stderr)
        if input and self.stdin:
            yield Coro(write_proc, self.stdin, input).finish()

        out = err = None
        if self.stdout:
            out = yield stdout_coro.finish()
        if self.stderr:
            err = yield stderr_coro.finish()
        raise StopIteration(out, err)
Esempio n. 2
0
    def map_results(self, gen, iter):
        """Run generator 'gen' once for every element of the iterable 'iter'
        and collect the outcomes. The coroutine finishes with a list of
        results, in the same order as the elements of 'iter'.

        An element that is a tuple is used as the positional arguments; any
        other element with '__iter__' is converted to a tuple first, and
        anything else is passed as the single argument.

        Must be used with 'yield', as for example,
        'results = yield scheduler.map_results(generator, list_of_tuples)'.
        """
        def exec_proc(gen, *args):
            yield self.execute(gen, *args)

        coros = []
        for item in iter:
            if isinstance(item, tuple):
                params = item
            elif hasattr(item, '__iter__'):
                params = tuple(item)
            else:
                params = (item,)
            coros.append(Coro(exec_proc, gen, *params))

        # all coroutines are created above before any of them is waited on
        results = []
        for pending in coros:
            results.append((yield pending.finish()))
        raise StopIteration(results)
Esempio n. 3
0
    def finish(self, close=False):
        """Block until every scheduled coroutine is done. With 'close' set to
        True, the 'proc_close' coroutines (if any) are run for the servers in
        'self._close_servers' and the computation is closed afterwards.

        Must be used with 'yield' as 'yield job_scheduler.finish()'.
        """

        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            closers = []
            if self._proc_close:
                closers = [Coro(self._proc_close, discoro.Scheduler.ServerInitialized, server)
                           for server in self._close_servers]
            # the table is reset before the close coroutines are waited on
            self._close_servers = {}
            for closer in closers:
                yield closer.finish()
        # wait once more after the close coroutines were run above
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if not close:
            return
        yield self.computation.close()
        self._askew_results.clear()
Esempio n. 4
0
 def setup_proc(self, msg, coro=None):
     """Coroutine: run 'self._proc_available' for the server at 'msg.info'
     and, if it finishes with 0, record that server as available (and as one
     to be closed later).
     """
     if self._remote_scheduler:
         yield self.asyncoro.peer(msg.info)
     outcome = yield Coro(self._proc_available, msg.info).finish()
     if outcome == 0:
         server = msg.info
         self._close_servers[server] = server
         self._servers[server] = server
         self._server_avail.set()
Esempio n. 5
0
    def __init__(self):
        """Start with an empty service table and launch the message
        dispatcher coroutine.
        """
        logger.setLevel(logging.INFO)

        self._services = {}

        # NOTE: per the original author, a service that blocks while handling
        # a message stalls all application communication; if a service has to
        # block for I/O, it should do so asynchronously in the destination
        # service. (A pool of 2 * cpu_count dispatchers was considered and
        # left disabled.)
        self._dispatcher_coro = Coro(self._message_dispatcher)
Esempio n. 6
0
def coro1(coro=None):
    """Client coroutine: locate the peer where 'rci_1' is registered, start
    five remote coroutines there, monitor each one and send it a message.
    """
    # if server is on remote network, automatic discovery won't work,
    # so add it explicitly
    # yield scheduler.peer('192.168.21.5')

    # look up where 'rci_1' is registered; alternately, location can be
    # explicitly created with asyncoro.Location or obtained with
    # 'locate_peer' etc.
    location = yield scheduler.locate_RCI('rci_1', timeout=2)
    if not location:
        raise Exception('failed')
    count = 5
    monitor = Coro(monitor_proc, count)
    for idx in range(count):
        remote = yield scheduler.run_RCI(location, 'rci_1', 'test%s' % idx, b=idx)
        logger.debug('rcoro: %s/%s', remote.name, remote._id)
        # have 'monitor' watch this remote coroutine
        yield monitor.monitor(remote)
        # pass the message to the remote coroutine
        remote.send('msg:%s' % idx)
        # pause for a random interval (up to a second) before the next one
        yield coro.sleep(random.uniform(0, 1))
Esempio n. 7
0
    def __init__(self, message_router):
        """Register as the node service with 'message_router', initialise
        the bookkeeping containers, start the scheduler thread (as daemon)
        and launch the stabilize coroutine.
        """
        super(Node_Service, self).__init__(SERVICE_NODE, message_router, cl_name="ns")
        self.exit = self.pause_scheduler = False
        self.nodes = {}
        self.queue = deque()
        self.delay_queue = deque()

        # all attributes above are in place before the thread is started
        worker = Thread(target=self._thread_scheduler)
        worker.daemon = True
        self.scheduler_thread = worker
        worker.start()

        self._stabilize_coro = Coro(self._coro_stabilize)
Esempio n. 8
0
    def _server_connect(self, coro=None):
        """Coroutine: open the outbound TCP connection to the remote peer.

        Creates an AsynCoroSocket (stored as 'self.outbound_socket') with
        TCP_NODELAY enabled, connects it to (self.remote_ip,
        self.remote_port), starts the sending coroutine 'self._client_send'
        (kept in 'self._send_coro') and finally notifies
        'self.network_service.on_server_connect'.

        Errors raised along the way are reported with 'show_error()' and are
        not propagated to the caller.
        """
        try:
            #logger.debug('CLIENT: connecting to peer at %s:%s', self.remote_ip, str(self.remote_port))
            self.outbound_socket = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_STREAM))
            self.outbound_socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1) # if you're gonna act like UDP

            yield self.outbound_socket.connect((self.remote_ip, self.remote_port))
            #logger.debug('CLIENT: connected to peer at %s:%s', self.remote_ip, str(self.remote_port))
            self._send_coro = Coro(self._client_send)
            #Coro(self._client_recv) # unneeded if we don't utilize bi-directional communication in UDP style messaging

            self.network_service.on_server_connect(self, self.context)
        except Exception:
            # Only genuine errors are reported. A bare 'except:' would also
            # intercept GeneratorExit (raised at the 'yield' above when this
            # generator is closed), KeyboardInterrupt and SystemExit, and
            # report them as connection failures.
            show_error()
Esempio n. 9
0
    def communicate(self, input=None):
        """Counterpart of Popen's communicate: send 'input' to stdin and
        collect everything from stdout and stderr.

        Must be used with 'yield' as
        'stdout, stderr = yield async_pipe.communicate()'

        'input' must be either data (str or bytes) or an object with 'read'
        method (i.e., regular file object or AsyncFile object).

        The coroutine finishes with the pair (stdout data, stderr data); an
        entry is None when the corresponding 'self.stdout' / 'self.stderr'
        is not set.
        """
        def write_proc(fd, input, coro=None):
            # Coroutine: write the whole of 'input' to 'fd' and close 'fd'.
            block_size = 16384
            if isinstance(input, (str, bytes)):
                sent = yield fd.write(input, full=True)
                if sent != len(input):
                    raise IOError('write failed')
            else:
                # TODO: how to know if 'input' is file object for
                # on-disk file?
                if hasattr(input, 'seek') and hasattr(input, 'fileno'):
                    pull = partial_func(os.read, input.fileno())
                else:
                    pull = input.read
                block = yield pull(block_size)
                while block:
                    # text blocks are encoded before being written
                    if isinstance(block, str):
                        block = block.encode()
                    sent = yield fd.write(block, full=True)
                    if sent != len(block):
                        raise IOError('write failed')
                    block = yield pull(block_size)
                input.close()
            fd.close()

        def read_proc(fd, coro=None):
            # Coroutine: read 'fd' until an empty read, close it and finish
            # with the concatenation of everything read.
            block_size = 16384
            collected = []
            block = yield fd.read(block_size)
            while block:
                collected.append(block)
                block = yield fd.read(block_size)
            fd.close()
            raise StopIteration(b''.join(collected))

        # readers are started before the writer is waited upon
        if self.stdout:
            stdout_coro = Coro(read_proc, self.stdout)
        if self.stderr:
            stderr_coro = Coro(read_proc, self.stderr)
        if input and self.stdin:
            yield Coro(write_proc, self.stdin, input).finish()

        stdout_data = None
        if self.stdout:
            stdout_data = yield stdout_coro.finish()
        stderr_data = None
        if self.stderr:
            stderr_data = yield stderr_coro.finish()
        raise StopIteration(stdout_data, stderr_data)
Esempio n. 10
0
 def status_proc(self, msg, coro=None):
     """Coroutine: run 'self._proc_status' with the status and info of
     'msg'; if it finishes with 0, record the server at 'msg.info' as
     available.
     """
     outcome = yield Coro(self._proc_status, msg.status, msg.info).finish()
     if outcome == 0:
         server = msg.info
         self._servers[server] = server
         self._server_avail.set()
Esempio n. 11
0
 def setup_proc(self, msg, coro=None):
     """Coroutine: run 'self._proc_available' for the server at 'msg.info';
     if it finishes with 0, record that server both as available and as one
     to be closed later.
     """
     outcome = yield Coro(self._proc_available, msg.info).finish()
     if outcome == 0:
         server = msg.info
         self._close_servers[server] = server
         self._servers[server] = server
         self._server_avail.set()
Esempio n. 12
0
    def _status_proc(self, coro=None):
        """Internal use only. Coroutine to process discoro scheduler messages.

        Loops forever receiving messages. Two kinds are handled:

        * asyncoro.MonitorException: 'msg.args[0]' is the remote coroutine
          and 'msg.args[1]' is a pair whose second element is handed to the
          waiting client coroutine (if any). The server is put back in
          'self._servers' when the coroutine had been given exclusive use of
          it (non-zero use count).
        * DiscoroStatus: server / computation status changes; user supplied
          'proc_available', 'proc_status' and 'proc_close' generator
          functions are run as coroutines where applicable.

        The coroutine ends when ComputationClosed is received for the
        computation recorded in 'self.computation_sign'.
        """

        coro.set_daemon()
        # at scheduler exit, run 'finish(close=True)' and wait for its value
        # NOTE(review): meaning of the first argument (15) to 'atexit' is not
        # visible here - confirm against asyncoro
        coro.scheduler().atexit(15, lambda: Coro(self.finish, True).value())
        while 1:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                # exceptions whose type field is ServerClosed are ignored here
                if msg.args[1][0] == discoro.Scheduler.ServerClosed:
                    continue
                rcoro = msg.args[0]
                # 'missing' is the sentinel for a coroutine not (yet) in
                # self._rcoros
                client, use_count = self._rcoros.pop(rcoro, ('missing', 0))
                if client is None:
                    # no client coroutine is waiting for this result
                    pass
                elif isinstance(client, Coro):
                    # hand the result to the waiting client coroutine
                    client._proceed_(msg.args[1][1])
                elif client == 'missing':
                    # Due to 'yield' used to create rcoro, scheduler may not
                    # have updated self._rcoros before the coroutine's
                    # MonitorException is received, so put it in
                    # 'askew_results'. The scheduling coroutine will resend it
                    # when it receives rcoro
                    self._askew_results[rcoro] = msg
                    continue

                else:
                    asyncoro.logger.warning('RemoteCoroScheduler: invalid status message ignored')
                    continue
                # non-zero use count: the server is made available again
                if use_count:
                    self._servers[rcoro.location] = rcoro.location
                    self._server_avail.set()
                # nothing left running: wake up anyone waiting in 'finish'
                if not self._rcoros:
                    self._rcoros_done.set()

            elif isinstance(msg, DiscoroStatus):
                if msg.status == discoro.Scheduler.ServerInitialized:
                    # 'proc_available' takes precedence over 'proc_status';
                    # with neither, the server is used unconditionally
                    if self._proc_available:
                        def setup_proc(self, msg, coro=None):
                            # server is used only if 'proc_available' exits with 0
                            if (yield Coro(self._proc_available, msg.info).finish()) == 0:
                                self._close_servers[msg.info] = msg.info
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(setup_proc, self, msg)
                    elif self._proc_status:
                        def status_proc(self, msg, coro=None):
                            # server is used only if 'proc_status' exits with 0
                            if (yield Coro(self._proc_status, msg.status, msg.info).finish()) == 0:
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(status_proc, self, msg)
                    else:
                        self._servers[msg.info] = msg.info
                        self._server_avail.set()

                elif msg.status == discoro.Scheduler.ServerClosed:
                    self._servers.pop(msg.info, None)
                    # 'proc_close' is run only for servers still recorded in
                    # self._close_servers; otherwise 'proc_status' (if any)
                    # is notified
                    if self._close_servers.pop(msg.info, None) and self._proc_close:
                        Coro(self._proc_close, msg.status, msg.info)
                    elif self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                elif msg.status == discoro.Scheduler.ComputationScheduled:
                    # remember 'info' to recognise the matching
                    # ComputationClosed message later
                    self.computation_sign = msg.info
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                elif (msg.status == discoro.Scheduler.ComputationClosed and
                      msg.info == self.computation_sign):
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                    # this computation is closed; end this coroutine
                    raise StopIteration
                elif msg.status != discoro.Scheduler.CoroCreated:
                    # every other status except CoroCreated is forwarded to
                    # 'proc_status' (if any)
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
Esempio n. 13
0
class RemoteCoroScheduler(object):
    """Scheduler for submitting computation jobs.

    When coroutines are created with 'run' methods of Computation instances,
    they are created with a load-balancing algorithm on available servers with
    no limit on how many coroutines are run at a server. This works when
    coroutines are not CPU bound always. If, however, coroutines are
    computations (CPU bound always/mostly), then it may be more appropriate to
    schedule one coroutine at a server so creating a new coroutine waits until a
    server becomes available.

    RemoteCoroScheduler schedules at most one computation (coroutine) at a
    server process at any time (so a node may execute as many computation
    coroutines as there are server processes running on that node, but not
    more).

    See 'discomp*.py' files in 'examples' directory for some use cases.

    NB: When using this scheduler, 'run' method of computation shouldn't be used
    to create (remote) coroutines (unless those don't take up CPU), as this
    scheduler is not aware of those.
    """

    def __init__(self, computation, proc_status=None, proc_available=None, proc_close=None):
        """'computation' should be an instance of discoro.Computation

        'proc_status' if not None should be a generator function that is called
        (as coroutine) with the status and info, as received by status_coro. If
        status is ServerInitialized and this function returns non-zero value,
        the server is ignored; i.e., jobs scheduled with 'schedule' or 'execute'
        will not use that server.

        'proc_available' if not None should be a generator function that is
        called (as coroutine) with the location of a server process when it
        becomes available (after all 'depends' of computation have been
        transferred). The coroutine runs at the client; it can create remote
        coroutine(s) at the server process, perhaps to setup, such as
        initializing global variables, transfer additional files etc. The
        coroutine should exit with 0 to indicate successful setup; any other
        value is interpreted as failure and not used by scheduler.

        'proc_close' if not None should be a generator function that is called
        (as coroutine) with the status and location of server process when
        server is about to be closed, or already closed. The coroutine runs at
        the client; it can create remote coroutine(s) at server process to
        cleanup, such as delete global variables, transfer files back to client
        etc. The coroutine is called with two parameters: 'status', which is
        either 'discoro.Scheduler.ServerInitialized' when server is about to be
        closed (i.e., server is still available, and remote coroutines can be
        executed), or 'discoro.Scheduler.ServerClosed' when server is already
        closed (e.g., due to zombie_period time elapsed without communication,
        or server was manually closed with command-line etc.), and 'location' of
        server process.

        Any of the three callbacks that is not a generator function is
        ignored with a warning.
        """

        if proc_status:
            if not inspect.isgeneratorfunction(proc_status):
                asyncoro.logger.warning('Invalid proc_status ignored')
                proc_status = None
        if proc_available:
            if not inspect.isgeneratorfunction(proc_available):
                asyncoro.logger.warning('Invalid proc_available ignored')
                proc_available = None
        if proc_close:
            if not inspect.isgeneratorfunction(proc_close):
                asyncoro.logger.warning('Invalid proc_close ignored')
                proc_close = None

        self._proc_status = proc_status
        self._proc_available = proc_available
        self._proc_close = proc_close
        # servers set up successfully with 'proc_available' (location ->
        # location); 'proc_close' is run for these
        self._close_servers = {}

        self.computation = computation
        # 'info' of the ComputationScheduled status; used to recognise the
        # matching ComputationClosed status
        self.computation_sign = None
        self.status_coro = Coro(self._status_proc)
        # install this scheduler's status coroutine only if the computation
        # doesn't have one already
        if not computation.status_coro:
            computation.status_coro = self.status_coro
        # remote coroutine -> (client, use_count): 'client' is the coroutine
        # waiting for the result (or None); 'use_count' is 1 if the server
        # was taken out of self._servers for it, 0 otherwise
        self._rcoros = {}
        # set by status coroutine when self._rcoros becomes empty
        self._rcoros_done = asyncoro.Event()
        # results that arrived before the remote coroutine was recorded in
        # self._rcoros (remote coroutine -> MonitorException)
        self._askew_results = {}
        # servers currently available for 'schedule' / 'execute' (location ->
        # location)
        self._servers = {}
        # set whenever a server is added to self._servers
        self._server_avail = asyncoro.Event()
        Coro(computation.schedule)

    def schedule(self, gen, *args, **kwargs):
        """Similar to 'run' method of computation, except as noted above: This
        method will block until a server process is available (i.e., not running
        another computation).

        Must be used with 'yield', similar to 'run' method of Computation
        instance. The value is the result of computation's 'run_at' method.
        """

        while not self._servers:
            self._server_avail.clear()
            yield self._server_avail.wait()
        # take a server out of the available set for exclusive use
        sloc, loc = self._servers.popitem()
        rcoro = yield self.computation.run_at(loc, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            # no client waits for the result; use count 1 so the server is
            # put back when the coroutine finishes
            self._rcoros[rcoro] = (None, 1)
            # result may have arrived before rcoro was recorded above; if so,
            # resend it to status coroutine
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
        else:
            # creating remote coroutine failed; server is available again
            self._servers[sloc] = loc
            self._server_avail.set()
        raise StopIteration(rcoro)

    def execute(self, gen, *args, **kwargs):
        """Similar to 'run' method of computation, except as noted above: The
        caller (client coroutine) will block until a server process is available
        (i.e., not running another computation), where remote coroutine with
        given 'gen', 'args' and 'kwargs' runs and finishes. The return value is
        the result of computation.

        Must be used with 'yield', similar to 'run' method of Computation
        instance.
        """

        while not self._servers:
            self._server_avail.clear()
            yield self._server_avail.wait()
        sloc, loc = self._servers.popitem()
        rcoro = yield self.computation.run_at(loc, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            client = asyncoro.AsynCoro.cur_coro()
            self._rcoros[rcoro] = (client, 1)
            # result may have arrived before rcoro was recorded above; if so,
            # resend it to status coroutine
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
            # NOTE(review): presumably suspends the client until status
            # coroutine calls 'client._proceed_' with the result - confirm
            # against asyncoro
            client._await_()
        else:
            # creating remote coroutine failed; server is available again and
            # the failure is given to caller wrapped in MonitorException
            self._servers[sloc] = loc
            self._server_avail.set()
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

    def execute_at(self, where, gen, *args, **kwargs):
        """Similar to 'run_at' method of computation, except the calling
        coroutine is blocked until the computation finishes and exit value of
        computation is returned. Unlike 'execute', the computation is executed
        right away, even if remote server process is executing another
        computation.

        Must be used with 'yield', similar to 'run_at' method of Computation
        instance.
        """

        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            client = asyncoro.AsynCoro.cur_coro()
            # use count 0: no server was taken out of self._servers
            self._rcoros[rcoro] = (client, 0)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
            # NOTE(review): presumably suspends the client until status
            # coroutine calls 'client._proceed_' with the result - confirm
            # against asyncoro
            client._await_()
        else:
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

    def map_results(self, gen, iter):
        """Execute generator 'gen' with arguments from given iterable. The
        return value is list of results that correspond to executing 'gen' with
        arguments in iterable in the same order.

        An element of iterable that is not a tuple is converted to tuple if it
        has '__iter__', and otherwise used as the only argument.

        Must be used with 'yield', as for example,
        'results = yield scheduler.map_results(generator, list_of_tuples)'.
        """
        def exec_proc(gen, *args):
            yield self.execute(gen, *args)

        coros = []
        append_coro = coros.append
        # create all coroutines first, so they are all scheduled before
        # waiting for any result
        for params in iter:
            if not isinstance(params, tuple):
                if hasattr(params, '__iter__'):
                    params = tuple(params)
                else:
                    params = (params,)
            append_coro(Coro(exec_proc, gen, *params))
        results = [None] * len(coros)
        for i, coro in enumerate(coros):
            result = yield coro.finish()
            results[i] = result
        raise StopIteration(results)

    def submit_at(self, where, gen, *args, **kwargs):
        """Similar to 'run_at' method of computation. If 'where' is None, the
        calling coroutine is blocked until any server is discovered and
        initialized (so computation's 'run_at' will not fail). Unlike
        'schedule', this method doesn't wait for server to be free (i.e., not
        running any other coroutines), nor unlike 'execute_at', the caller is
        not blocked until the coroutine finishes.

        Must be used with 'yield', similar to 'run_at' method of Computation
        instance. The value returned is result of 'run_at' method of computation
        (reference to remote coroutine in case of success, and error otherwise).
        """
        if not where:
            # wait only if no server is available and nothing is running
            if not self._servers and not self._rcoros:
                yield self._server_avail.wait()
        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if isinstance(rcoro, Coro):
            # no client waits for the result and no server was taken out of
            # self._servers
            self._rcoros[rcoro] = (None, 0)
            if self._askew_results:
                msg = self._askew_results.pop(rcoro, None)
                if msg:
                    self.status_coro.send(msg)
        raise StopIteration(rcoro)

    def submit(self, gen, *args, **kwargs):
        """Submit coroutine at any server; see 'submit_at' above.
        """
        yield self.submit_at(None, gen, *args, **kwargs)

    def finish(self, close=False):
        """Wait until all scheduled coroutines finish. If 'close' is True, the
        computation is closed as well.

        Must be used with 'yield' as 'yield job_scheduler.finish()'.
        """

        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            if self._proc_close:
                # run 'proc_close' (with status ServerInitialized) for each
                # server that was set up with 'proc_available'
                coros = [Coro(self._proc_close, discoro.Scheduler.ServerInitialized, location)
                         for location in self._close_servers]
                self._close_servers = {}
                for coro in coros:
                    yield coro.finish()
            else:
                self._close_servers = {}
        # wait again, after 'proc_close' coroutines (if any) were run above
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            yield self.computation.close()
            self._askew_results.clear()

    def _status_proc(self, coro=None):
        """Internal use only. Coroutine to process discoro scheduler messages.
        """

        coro.set_daemon()
        # at scheduler exit, run 'finish(close=True)' and wait for its value
        # NOTE(review): meaning of the first argument (15) to 'atexit' is not
        # visible here - confirm against asyncoro
        coro.scheduler().atexit(15, lambda: Coro(self.finish, True).value())
        while 1:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                # exceptions whose type field is ServerClosed are ignored here
                if msg.args[1][0] == discoro.Scheduler.ServerClosed:
                    continue
                rcoro = msg.args[0]
                # 'missing' is the sentinel for a coroutine not (yet) in
                # self._rcoros
                client, use_count = self._rcoros.pop(rcoro, ('missing', 0))
                if client is None:
                    # created with 'schedule' / 'submit_at'; nobody waits
                    pass
                elif isinstance(client, Coro):
                    # created with 'execute' / 'execute_at'; hand the result
                    # to the waiting client coroutine
                    client._proceed_(msg.args[1][1])
                elif client == 'missing':
                    # Due to 'yield' used to create rcoro, scheduler may not
                    # have updated self._rcoros before the coroutine's
                    # MonitorException is received, so put it in
                    # 'askew_results'. The scheduling coroutine will resend it
                    # when it receives rcoro
                    self._askew_results[rcoro] = msg
                    continue

                else:
                    asyncoro.logger.warning('RemoteCoroScheduler: invalid status message ignored')
                    continue
                # non-zero use count: the server is made available again
                if use_count:
                    self._servers[rcoro.location] = rcoro.location
                    self._server_avail.set()
                # nothing left running: wake up anyone waiting in 'finish'
                if not self._rcoros:
                    self._rcoros_done.set()

            elif isinstance(msg, DiscoroStatus):
                if msg.status == discoro.Scheduler.ServerInitialized:
                    # 'proc_available' takes precedence over 'proc_status';
                    # with neither, the server is used unconditionally
                    if self._proc_available:
                        def setup_proc(self, msg, coro=None):
                            # server is used only if 'proc_available' exits with 0
                            if (yield Coro(self._proc_available, msg.info).finish()) == 0:
                                self._close_servers[msg.info] = msg.info
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(setup_proc, self, msg)
                    elif self._proc_status:
                        def status_proc(self, msg, coro=None):
                            # server is used only if 'proc_status' exits with 0
                            if (yield Coro(self._proc_status, msg.status, msg.info).finish()) == 0:
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(status_proc, self, msg)
                    else:
                        self._servers[msg.info] = msg.info
                        self._server_avail.set()

                elif msg.status == discoro.Scheduler.ServerClosed:
                    self._servers.pop(msg.info, None)
                    # 'proc_close' is run only for servers still recorded in
                    # self._close_servers; otherwise 'proc_status' (if any)
                    # is notified
                    if self._close_servers.pop(msg.info, None) and self._proc_close:
                        Coro(self._proc_close, msg.status, msg.info)
                    elif self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                elif msg.status == discoro.Scheduler.ComputationScheduled:
                    # remember 'info' to recognise the matching
                    # ComputationClosed message later
                    self.computation_sign = msg.info
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                elif (msg.status == discoro.Scheduler.ComputationClosed and
                      msg.info == self.computation_sign):
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
                    # this computation is closed; end this coroutine
                    raise StopIteration
                elif msg.status != discoro.Scheduler.CoroCreated:
                    # every other status except CoroCreated is forwarded to
                    # 'proc_status' (if any)
                    if self._proc_status:
                        Coro(self._proc_status, msg.status, msg.info)
Esempio n. 14
0
class RemoteCoroScheduler(object):
    """Scheduler for submitting computation jobs.

    When coroutines are created with 'run' methods of Computation instances,
    they are created with a load-balancing algorithm on available servers with
    no limit on how many coroutines are run at a server. This works when
    coroutines are not CPU bound always. If, however, coroutines are
    computations (CPU bound always/mostly), then it may be more appropriate to
    schedule one coroutine at a server so creating a new coroutine waits until a
    server becomes available.

    RemoteCoroScheduler schedules at most one computation (coroutine) at a
    server process at any time (so a node may execute as many computation
    coroutines as there are server processes running on that node, but not
    more).

    See 'discomp*.py' files in 'examples' directory for some use cases.

    NB: When using this scheduler, 'run' method of computation shouldn't be used
    to create (remote) coroutines (unless those don't take up CPU), as this
    scheduler is not aware of those.
    """

    def __init__(self, computation, status=None, node_available=None,
                 proc_available=None, proc_close=None):
        """Attach a one-job-per-server scheduler to 'computation' and
        schedule the computation.

        'computation' should be an instance of discoro.Computation.

        'status', if given, should be a generator function; it is run (as
        coroutine) with the status and info of messages received by
        status_coro. For ServerInitialized, the server is used for
        'schedule' / 'execute' jobs only if that coroutine finishes with 0.

        'node_available', if given, should be a generator function; when not
        given, the computation's '_node_available' is used instead. It is run
        (as coroutine) with the info of a discovered node, and its result is
        sent to the computation's scheduler as parameters of a 'setup_node'
        request.

        'proc_available', if given, should be a generator function; it is
        run (as coroutine) with the location of a server process once that
        server is initialized. It runs at the client and may create remote
        coroutines at the server, e.g., to initialize global variables or
        transfer additional files. The server is used only if the coroutine
        finishes with 0; any other value is interpreted as failure.

        'proc_close', if given, should be a generator function; it is run
        (as coroutine) with two parameters, 'status' and 'location' of the
        server process. 'status' is 'discoro.Scheduler.ServerInitialized'
        when the server is about to be closed (remote coroutines can still
        be executed there), or 'discoro.Scheduler.ServerClosed' when the
        server is already closed.

        Any callback that is not a generator function is ignored with a
        warning.
        """

        def usable(callback, complaint):
            # Callbacks are run as coroutines, so a non-generator function
            # can't be used; log 'complaint' and behave as if not given.
            if callback and not inspect.isgeneratorfunction(callback):
                asyncoro.logger.warning(complaint)
                return None
            return callback

        status = usable(status, 'Invalid status ignored')
        proc_available = usable(proc_available, 'Invalid proc_available ignored')
        proc_close = usable(proc_close, 'Invalid proc_close ignored')
        if not node_available:
            # fall back to the callback registered with the computation
            inherited = computation._node_available
            if inherited:
                node_available = inherited
        node_available = usable(node_available, 'Invalid node_available ignored')

        self._status = status
        self._proc_available = proc_available
        self._proc_close = proc_close
        self._node_available = node_available
        self._close_servers = {}

        self.computation = computation
        self.computation_sign = None
        self.status_coro = Coro(self._status_proc)
        user_status_coro = computation.status_coro
        if not isinstance(user_status_coro, Coro):
            computation.status_coro = self.status_coro
        else:
            # The computation already has a status coroutine: put a relay in
            # its place that hands every message to that coroutine first and
            # then to this scheduler's own status coroutine.
            def chain_status_msgs(sched_status, user_status, coro=None):
                coro.set_daemon()
                while True:
                    update = yield coro.receive()
                    user_status.send(update)
                    sched_status.send(update)
            computation.status_coro = Coro(chain_status_msgs, self.status_coro,
                                           user_status_coro)
        # remote coroutine -> (client coroutine or None, use_count)
        self._rcoros = {}
        self._rcoros_done = asyncoro.Event()
        # exit messages that arrived before the remote coroutine was recorded
        self._askew_results = {}
        # locations of servers currently free to take a job
        self._servers = {}
        self._server_avail = asyncoro.Event()
        self._remote_scheduler = False
        self.asyncoro = asyncoro.AsynCoro()
        Coro(computation.schedule)

    def schedule(self, gen, *args, **kwargs):
        """Run 'gen' with given arguments at a free server process, waiting
        (blocking the calling coroutine) until one is available, i.e., not
        running another computation scheduled through this scheduler.

        Must be used with 'yield', similar to 'run' method of Computation
        instance. The value is whatever computation's 'run_at' gave: remote
        coroutine on success, and error otherwise.
        """

        while True:
            if self._servers:
                break
            self._server_avail.clear()
            yield self._server_avail.wait()
        key, location = self._servers.popitem()
        rcoro = yield self.computation.run_at(location, gen, *args, **kwargs)
        if not isinstance(rcoro, Coro):
            # creating remote coroutine failed; the server is still free
            self._servers[key] = location
            self._server_avail.set()
            raise StopIteration(rcoro)

        # no client is waiting for the result; use_count 1 returns the
        # server to the free pool when the coroutine's exit is processed
        self._rcoros[rcoro] = (None, 1)
        if self._askew_results:
            early_exit = self._askew_results.pop(rcoro, None)
            if early_exit:
                # exit message arrived before rcoro was recorded; replay it
                self.status_coro.send(early_exit)
        raise StopIteration(rcoro)

    def execute(self, gen, *args, **kwargs):
        """Run 'gen' with given arguments at a free server process and wait
        for it to finish: The caller (client coroutine) blocks until a server
        process is available (i.e., not running another computation), and
        then until the remote coroutine finishes. The return value is the
        result of computation.

        If remote coroutine can't be created, the value is a
        'MonitorException' built from the failure returned by 'run_at'.

        Must be used with 'yield', similar to 'run' method of Computation
        instance.
        """

        while True:
            if self._servers:
                break
            self._server_avail.clear()
            yield self._server_avail.wait()
        key, location = self._servers.popitem()
        rcoro = yield self.computation.run_at(location, gen, *args, **kwargs)
        if not isinstance(rcoro, Coro):
            # creating remote coroutine failed; the server is still free
            self._servers[key] = location
            self._server_avail.set()
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

        client = asyncoro.AsynCoro.cur_coro()
        # status coroutine resumes 'client' with the result when rcoro exits
        self._rcoros[rcoro] = (client, 1)
        if self._askew_results:
            early_exit = self._askew_results.pop(rcoro, None)
            if early_exit:
                # exit message arrived before rcoro was recorded; replay it
                self.status_coro.send(early_exit)
        client._await_()

    def execute_at(self, where, gen, *args, **kwargs):
        """Run 'gen' with given arguments at 'where' and wait for it to
        finish; the exit value of computation is returned. Unlike 'execute',
        this doesn't wait for a free server: the computation is started
        right away, even if remote server process is executing another
        computation.

        If remote coroutine can't be created, the value is a
        'MonitorException' built from the failure returned by 'run_at'.

        Must be used with 'yield', similar to 'run_at' method of Computation
        instance.
        """

        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if not isinstance(rcoro, Coro):
            raise StopIteration(asyncoro.MonitorException(None, (type(rcoro), rcoro)))

        client = asyncoro.AsynCoro.cur_coro()
        # use_count 0: no server was taken from the free pool, so none is
        # returned to it when rcoro exits
        self._rcoros[rcoro] = (client, 0)
        if self._askew_results:
            early_exit = self._askew_results.pop(rcoro, None)
            if early_exit:
                # exit message arrived before rcoro was recorded; replay it
                self.status_coro.send(early_exit)
        client._await_()

    def map_results(self, gen, iter):
        """Execute generator 'gen' once for each item of given iterable and
        collect the results. The return value is list of results, in the
        same order as the items they were computed from.

        An item that is a tuple is used as the arguments; any other item
        with '__iter__' is converted to a tuple of arguments; anything else
        is passed as the single argument.

        Must be used with 'yield', as for example,
        'results = yield scheduler.map_results(generator, list_of_tuples)'.
        """
        def exec_proc(gen, *args):
            yield self.execute(gen, *args)

        # start all jobs first; 'execute' makes each wait for a free server
        coros = []
        for item in iter:
            if isinstance(item, tuple):
                call_args = item
            elif hasattr(item, '__iter__'):
                call_args = tuple(item)
            else:
                call_args = (item,)
            coros.append(Coro(exec_proc, gen, *call_args))

        # then collect in submission order
        results = []
        for job in coros:
            outcome = yield job.finish()
            results.append(outcome)
        raise StopIteration(results)

    def submit_at(self, where, gen, *args, **kwargs):
        """Create remote coroutine with 'run_at' method of computation
        without waiting for a free server and without waiting for the
        coroutine to finish (compare with 'schedule' and 'execute_at').

        If 'where' is not given (e.g., None) and no server is known yet
        (no free servers and no remote coroutines running), the calling
        coroutine is first blocked until a server becomes available.

        Must be used with 'yield', similar to 'run_at' method of Computation
        instance. The value returned is result of 'run_at' method of computation
        (reference to remote coroutine in case of success, and error otherwise).
        """
        if not (where or self._servers or self._rcoros):
            yield self._server_avail.wait()
        rcoro = yield self.computation.run_at(where, gen, *args, **kwargs)
        if not isinstance(rcoro, Coro):
            raise StopIteration(rcoro)

        # tracked so 'finish' waits for it; no client and no server to return
        self._rcoros[rcoro] = (None, 0)
        if self._askew_results:
            early_exit = self._askew_results.pop(rcoro, None)
            if early_exit:
                # exit message arrived before rcoro was recorded; replay it
                self.status_coro.send(early_exit)
        raise StopIteration(rcoro)

    def submit(self, gen, *args, **kwargs):
        """Submit coroutine at any server; see 'submit_at' above.
        """
        yield self.submit_at(None, gen, *args, **kwargs)

    def finish(self, close=False):
        """Wait until all scheduled coroutines finish. If 'close' is True, the
        computation is closed as well.

        Must be used with 'yield' as 'yield job_scheduler.finish()'.
        """

        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            if self._proc_close:
                coros = [Coro(self._proc_close, discoro.Scheduler.ServerInitialized, location)
                         for location in self._close_servers]
                self._close_servers = {}
                for coro in coros:
                    yield coro.finish()
            else:
                self._close_servers = {}
        self._rcoros_done.clear()
        if self._rcoros:
            yield self._rcoros_done.wait()
        if close:
            yield self.computation.close()
            self._askew_results.clear()

    def _status_proc(self, coro=None):
        """Internal use only. Coroutine to process discoro scheduler messages.

        Loops on 'coro.receive()' and handles two kinds of messages:

        'asyncoro.MonitorException' is treated as the exit notification of a
        remote coroutine: 'msg.args[0]' is looked up in 'self._rcoros', a
        waiting client coroutine (if any) is resumed with 'msg.args[1][1]',
        and, if the job had taken a server from the free pool, that server's
        location is put back.

        'DiscoroStatus' messages update the pool of free servers
        (ServerInitialized / ServerClosed), run the user callbacks given to
        the constructor, and track the computation's lifetime. The coroutine
        ends when ComputationClosed is received with the sign recorded at
        ComputationScheduled.
        """

        coro.set_daemon()
        # Register a callable with the scheduler's 'atexit' that runs
        # 'finish(True)' as a coroutine and waits for its value.
        # NOTE(review): meaning of the first argument (15) is not visible
        # here - presumably a priority; confirm against asyncoro.
        coro.scheduler().atexit(15, lambda: Coro(self.finish, True).value())
        while 1:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                # exceptions tagged with ServerClosed are not about a tracked
                # remote coroutine; nothing to do for them here
                if msg.args[1][0] == discoro.Scheduler.ServerClosed:
                    continue
                rcoro = msg.args[0]
                # 'missing' is a sentinel for "rcoro not recorded (yet)"
                client, use_count = self._rcoros.pop(rcoro, ('missing', 0))
                if client is None:
                    # created with 'schedule' / 'submit_at': nobody waits
                    pass
                elif isinstance(client, Coro):
                    # created with 'execute' / 'execute_at': resume the
                    # client with the second element of the exit info
                    client._proceed_(msg.args[1][1])
                elif client == 'missing':
                    # Due to 'yield' used to create rcoro, scheduler may not
                    # have updated self._rcoros before the coroutine's
                    # MonitorException is received, so put it in
                    # 'askew_results'. The scheduling coroutine will resend it
                    # when it receives rcoro
                    self._askew_results[rcoro] = msg
                    continue

                else:
                    asyncoro.logger.warning('RemoteCoroScheduler: invalid status message ignored')
                    continue
                if use_count:
                    # job had exclusive use of the server; it is free again
                    self._servers[rcoro.location] = rcoro.location
                    self._server_avail.set()
                if not self._rcoros:
                    # wakes up 'finish'
                    self._rcoros_done.set()

            elif isinstance(msg, DiscoroStatus):
                if msg.status == discoro.Scheduler.ServerInitialized:
                    # 'proc_available' takes precedence over 'status'; with
                    # either one, the server joins the free pool only if the
                    # callback coroutine finishes with 0
                    if self._proc_available:
                        def setup_proc(self, msg, coro=None):
                            if self._remote_scheduler:
                                yield self.asyncoro.peer(msg.info)
                            if (yield Coro(self._proc_available, msg.info).finish()) == 0:
                                # remembered so 'proc_close' is run for it
                                self._close_servers[msg.info] = msg.info
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(setup_proc, self, msg)
                    elif self._status:
                        def status_proc(self, msg, coro=None):
                            if (yield Coro(self._status, msg.status, msg.info).finish()) == 0:
                                self._servers[msg.info] = msg.info
                                self._server_avail.set()
                        Coro(status_proc, self, msg)
                    else:
                        # no callbacks: server is usable right away
                        self._servers[msg.info] = msg.info
                        self._server_avail.set()

                elif msg.status == discoro.Scheduler.ServerClosed:
                    self._servers.pop(msg.info, None)
                    # 'proc_close' is run only for servers that were set up
                    # with 'proc_available' and not yet cleaned up by
                    # 'finish'; otherwise 'status' (if any) is notified
                    if self._close_servers.pop(msg.info, None) and self._proc_close:
                        Coro(self._proc_close, msg.status, msg.info)
                    elif self._status:
                        Coro(self._status, msg.status, msg.info)
                elif msg.status == discoro.Scheduler.NodeDiscovered:
                    if self._node_available:
                        def setup_node(self, msg, coro=None):
                            if self._remote_scheduler:
                                yield self.asyncoro.peer(msg.info.location)
                            try:
                                params = yield asyncoro.Coro(self._node_available,
                                                             msg.info).finish()
                            except:
                                # any failure in the callback: give up on
                                # setting up this node
                                raise StopIteration

                            # normalize the callback's result to a tuple
                            if not isinstance(params, tuple):
                                if hasattr(params, '__iter__'):
                                    params = tuple(params)
                                else:
                                    params = (params,)

                            # 'msg' is rebound here: from the status message
                            # to the request sent to computation's scheduler
                            msg = {'req': 'setup_node', 'addr': msg.info.location.addr,
                                   'params': params, 'auth': self.computation._auth,
                                   'client': coro}
                            self.computation.scheduler.send(msg)
                        Coro(setup_node, self, msg)
                elif msg.status == discoro.Scheduler.ComputationScheduled:
                    # remember the sign to recognize the matching
                    # ComputationClosed below
                    self.computation_sign = msg.info
                    # scheduler at a different location than this client
                    if self.computation.scheduler.location != self.asyncoro.location:
                        self._remote_scheduler = True
                    if self._status:
                        Coro(self._status, msg.status, msg.info)
                elif (msg.status == discoro.Scheduler.ComputationClosed and
                      msg.info == self.computation_sign):
                    if self._status:
                        Coro(self._status, msg.status, msg.info)
                    # this computation is done; end the status coroutine
                    raise StopIteration
                elif msg.status != discoro.Scheduler.CoroCreated:
                    # everything else, except CoroCreated, is only forwarded
                    if self._status:
                        Coro(self._status, msg.status, msg.info)
Esempio n. 15
0
    def __init__(self, cpus, ip_addr=None, ext_ip_addr=None, node_port=None,
                 scheduler_node=None, scheduler_port=None,
                 dest_path_prefix='', secret='', keyfile=None, certfile=None,
                 max_file_size=None, zombie_interval=60):
        """Set up the node: resolve addresses, open TCP and UDP sockets on
        'node_port', prepare the destination directory and start the reply
        thread and the timer / UDP server coroutines.

        'cpus' must be positive and at most the number of CPUs reported by
        multiprocessing.

        'ip_addr' defaults to the address of this host's name;
        'ext_ip_addr' defaults to 'ip_addr'. An address that
        '_node_ipaddr' can't resolve raises Exception.

        'node_port' defaults to 51348 and 'scheduler_port' to 51347.

        'dest_path_prefix' defaults to 'tmp/dispy' under the root directory;
        the directory is created (accessible to owner only) if missing.

        'secret' is combined with a random signature to make the auth code.

        'keyfile' and 'certfile' are stored as absolute paths; with
        'certfile' given, the TCP socket is wrapped with SSL.

        'max_file_size' defaults to 'MaxFileSize'.

        'zombie_interval' is stored multiplied by 60 (presumably minutes
        converted to seconds; confirm against where it is used).
        """
        assert 0 < cpus <= multiprocessing.cpu_count()
        self.cpus = cpus

        if not ip_addr:
            self.name = socket.gethostname()
            ip_addr = socket.gethostbyname(self.name)
        else:
            ip_addr = _node_ipaddr(ip_addr)
            if not ip_addr:
                raise Exception('invalid ip_addr')
        if not ext_ip_addr:
            ext_ip_addr = ip_addr
        else:
            ext_ip_addr = _node_ipaddr(ext_ip_addr)
            if not ext_ip_addr:
                raise Exception('invalid ext_ip_addr')
        # prefer the name the external address maps to; if the lookup fails
        # for any reason, use the local host name
        try:
            self.name = socket.gethostbyaddr(ext_ip_addr)[0]
        except:
            self.name = socket.gethostname()
        node_port = node_port or 51348
        scheduler_port = scheduler_port or 51347

        self.ip_addr = ip_addr
        self.ext_ip_addr = ext_ip_addr
        self.scheduler_port = scheduler_port
        self.pulse_interval = None
        self.keyfile = os.path.abspath(keyfile) if keyfile else keyfile
        self.certfile = os.path.abspath(certfile) if certfile else certfile

        self.asyncoro = AsynCoro()

        # TCP socket: bound and listening
        self.tcp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        if self.certfile:
            self.tcp_sock = ssl.wrap_socket(self.tcp_sock, keyfile=self.keyfile,
                                            certfile=self.certfile)
        self.tcp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.tcp_sock.bind((self.ip_addr, node_port))
        self.address = self.tcp_sock.getsockname()
        self.tcp_sock.listen(30)

        self.dest_path_prefix = (dest_path_prefix.strip().rstrip(os.sep)
                                 if dest_path_prefix
                                 else os.path.join(os.sep, 'tmp', 'dispy'))
        if not os.path.isdir(self.dest_path_prefix):
            os.makedirs(self.dest_path_prefix)
            # read / write / execute for owner only
            os.chmod(self.dest_path_prefix, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        self.max_file_size = MaxFileSize if max_file_size is None else max_file_size

        self.avail_cpus = self.cpus
        self.computations = {}
        self.scheduler_ip_addr = None
        self.file_uses = {}
        self.job_infos = {}
        self.lock = asyncoro.Lock()
        self.terminate = False
        # random per-instance signature; auth code is SHA-1 of it + secret
        self.signature = os.urandom(20).encode('hex')
        self.auth_code = hashlib.sha1(self.signature + secret).hexdigest()
        self.zombie_interval = 60 * zombie_interval

        logger.debug('auth_code for %s: %s', ip_addr, self.auth_code)

        # UDP socket on all interfaces, same port as the TCP socket
        self.udp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self.udp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.udp_sock.bind(('', node_port))
        logger.info('serving %s cpus at %s:%s', self.cpus, self.ip_addr, node_port)
        logger.debug('tcp server at %s:%s', self.address[0], self.address[1])
        self.udp_sock = AsynCoroSocket(self.udp_sock, blocking=False)

        scheduler_ip_addr = _node_ipaddr(scheduler_node)

        self.reply_Q = multiprocessing.Queue()
        self.reply_Q_thread = threading.Thread(target=self.__reply_Q)
        self.reply_Q_thread.start()

        self.timer_coro = Coro(self.timer_task)
        # self.tcp_coro = Coro(self.tcp_server)
        self.udp_coro = Coro(self.udp_server, scheduler_ip_addr)
Esempio n. 16
0
class Message_Router():
    """Process-wide router that hands messages to registered services.

    Services are registered under an id with 'register_service'; 'route'
    delivers a message to the service named by the message's
    'dest_service'. 'attach_console' runs a blocking loop that reads
    commands from the console and turns them into console-command messages.
    """

    _instance = None

    @classmethod
    def instance(cls):
        """Return the shared router, creating it on first use."""
        if cls._instance is None:
            cls._instance = Message_Router()
        return cls._instance

    _commands = None
    exit = False

    def __init__(self):
        """Create an empty service table and start the dispatcher coroutine."""
        self._services = {}

        logger.setLevel(logging.INFO)

        # thread pool -- will burn up if services use the thread for blocking & totally
        # kill application communication. If you have to block for I/O then you better
        # be using async in the destination service
        self._dispatcher_coro = Coro(self._message_dispatcher)

    def _message_dispatcher(self, coro=None):
        """Coroutine: hand every received message to a thread-pool task
        until 'exit' is set.
        """
        coro.set_daemon()
        thread_pool = AsynCoroThreadPool(2 * multiprocessing.cpu_count())
        while True:
            try:
                message = yield coro.receive()

                if self.exit:  # abandon any work & just cleanly exit
                    break

                yield thread_pool.async_task(coro, self._dispatch_message, message)
            except Exception:
                # Only ordinary errors are reported and survived; a bare
                # 'except' here would also swallow GeneratorExit, so the
                # coroutine could never be closed.
                show_error()

        print("Coro(_message_dispatcher) exiting")

    def _dispatch_message(self, message):
        """Deliver 'message' to the service registered as its 'dest_service';
        report (on console) if no such service is registered, or if handling
        took more than 50 ms.
        """
        if message.dest_service in self._services:
            sw = Stopwatch()
            self._services[message.dest_service].handle_message(message)

            # for long running tasks (CPU-bound) you should pass off to a dedicated thread or tune
            # for I/O bound tasks you should queue the work until an I/O thread-pool thread can handle it
            if sw.ms() > 50:
                print("INVESTIGATE!!! %s(%s) took %0.3f ms!! Tuning may be required.'" % (message.dest_service, message.Type(), sw.ms()))
        else:
            # report the id that was actually looked up
            print("Unregistered service '" + message.dest_service + "'")

    def register_service(self, service_id, service):
        """Register 'service' under 'service_id'; an id that is already
        registered keeps its existing service.
        """
        if service_id not in self._services:
            self._services[service_id] = service

    def route(self, message):
        """Deliver 'message' to its destination service.

        Delivery is synchronous, in the caller's thread; messages are not
        queued to the dispatcher coroutine from here.
        """
        self._dispatch_message(message)

    def stop(self):
        """Stop all services, let the dispatcher coroutine exit and
        terminate the asyncoro scheduler.
        """
        self.exit = True

        # tell all my services to stop
        for service in self._services.values():
            try:
                service.stop()
            except:
                # deliberately best-effort: one failing service must not
                # prevent the others from being stopped
                show_error()

        # wake the dispatcher so it sees 'exit' and leaves its loop
        self._dispatcher_coro.send(None)
        time.sleep(.1)
        AsynCoro.instance().terminate()

    def attach_console(self):
        """Read commands from the console until end of input or until one
        of 'q', 'Q', 'quit', 'exit' is entered.

        An empty line sends the "print" command to SERVICE_NODE. Otherwise
        the line is split at the first space into service name and command;
        if the first word is not a service's 'cl_name' or 'service_id', it
        is taken as a command of whichever service lists it in
        'attach_to_console()', with the rest of the line as its one argument.
        """
        while True:
            try:
                cmd = raw_input()
            except EOFError:  # the user does not have a terminal
                return

            if cmd in ("q", "Q", "quit", "exit"):
                print("Exiting...")
                break

            try:
                node_info = None
                if SERVICE_NODE in self._services:
                    node_info = self._services[SERVICE_NODE].get_console_node()

                args = []
                # NOTE(review): with maxsplit=1 there are at most two parts,
                # so when a service name is given, everything after it is the
                # command and 'args' stays empty - confirm this is intended.
                parts = cmd.split(' ', 1)
                if not cmd:  # default action
                    self.route(Message_Console_Command(SERVICE_NODE, "print", args, node_info))
                elif len(parts) == 1:  # try to find a service with the command
                    for svc in self._services.values():
                        if parts[0] in svc.attach_to_console():
                            parts.insert(0, svc.service_id)
                            break

                if len(parts) > 1:
                    svcName = parts[0]  # consoleName
                    command = parts[1]  # commandName

                    # attempt to lookup service by consoleName
                    found = False
                    for svc in self._services.values():
                        if svc.cl_name == svcName or svc.service_id == svcName:
                            args.extend(parts[2:])
                            svc.handle_message(Message_Console_Command(svc.service_id, command, args, node_info))
                            found = True
                            break

                    if not found:  # see if a command was entered without a service name
                        for svc in self._services.values():
                            if svcName in svc.attach_to_console():
                                command = svcName
                                args.extend(parts[1:])
                                svc.handle_message(Message_Console_Command(svc.service_id, command, args, node_info))
                                found = True
                                break

                    if not found:
                        print(svcName + " is an unregistered service.")
            except Exception:
                # keep the console alive on command errors, but let
                # KeyboardInterrupt / SystemExit through
                show_error()
Esempio n. 17
0
def _discoro_proc():
    # coroutine
    """Server process receives computations and runs coroutines for it.

    This generator runs as a coroutine. The first message it receives must be
    the configuration dict ('req' == 'config'); the keys read from it in this
    function are 'id', 'phoenix', 'auth', 'serve' and 'mp_queue'.

    After start-up (PID file handling and creation of the working directory)
    it processes request dicts, dispatching on the 'req' key: 'run', 'setup',
    'close', 'node_info', 'status', 'quit' and 'terminate'. Messages that are
    not dicts are ignored. Once the main loop is left, only 'close' is
    processed while jobs are still running; then the working directory is
    emptied, the PID file is removed and the server's auth is put on
    'mp_queue'.

    All local names carry the '_discoro_' prefix (presumably so they do not
    clash with names defined by code of the computation, which is exec'ed
    into the module globals - confirm before renaming anything).
    """

    import os
    import shutil
    import traceback
    import sys
    import time

    # psutil is optional; without it 'node_status' is never enabled in 'setup'
    # and 'node_info' replies with placeholder values.
    try:
        import psutil
    except:
        psutil = None

    import asyncoro.disasyncoro as asyncoro
    from asyncoro import Coro
    from asyncoro.discoro import MinPulseInterval, MaxPulseInterval, \
         DiscoroNodeInfo, DiscoroNodeStatus

    _discoro_coro = asyncoro.AsynCoro.cur_coro()
    # the very first message must be the configuration
    _discoro_config = yield _discoro_coro.receive()
    assert _discoro_config['req'] == 'config'
    _discoro_coro.register('discoro_server')
    _discoro_name = asyncoro.AsynCoro.instance().name
    # working directory is 'discoro/server<id>'; the PID file is its sibling
    # 'discoro/server<id>.pid'
    asyncoro.AsynCoro.instance().dest_path = os.path.join('discoro',
                                                          'server%s' % (_discoro_config['id']))
    _discoro_dest_path = asyncoro.AsynCoro.instance().dest_path
    _discoro_pid_path = os.path.join(_discoro_dest_path, '..',
                                     'server%s.pid' % (_discoro_config['id']))
    _discoro_pid_path = os.path.normpath(_discoro_pid_path)
    # TODO: is file locking necessary?
    if os.path.exists(_discoro_pid_path):
        # a PID file is left over from another (possibly still running) server
        with open(_discoro_pid_path, 'r') as _discoro_req:
            _discoro_var = _discoro_req.read()
        _discoro_var = int(_discoro_var)
        if not _discoro_config['phoenix']:
            print('\n   Another discoronode seems to be running;\n'
                  '   make sure server with PID %d quit and remove "%s"\n' %
                  (_discoro_var, _discoro_pid_path))
            # without 'phoenix' the SIGTERM below is sent to this process
            # itself instead of the recorded PID
            _discoro_var = os.getpid()

        import signal
        try:
            os.kill(_discoro_var, signal.SIGTERM)
        except:
            pass
        else:
            # give the signalled process a moment, then check (without
            # blocking) whether it has been reaped
            time.sleep(0.1)
            try:
                if os.waitpid(_discoro_var, os.WNOHANG)[0] != _discoro_var:
                    asyncoro.logger.warning('Killing process %d failed' % _discoro_var)
            except:
                pass
        del signal
    # start with an empty working directory and record our PID
    if os.path.isdir(_discoro_dest_path):
        shutil.rmtree(_discoro_dest_path)
    os.makedirs(_discoro_dest_path)
    os.chdir(_discoro_dest_path)
    with open(_discoro_pid_path, 'w') as _discoro_var:
        _discoro_var.write('%s' % os.getpid())
    asyncoro.logger.debug('discoro server "%s" started at %s; '
                          'computation files will be saved in "%s"' %
                          (_discoro_name, _discoro_coro.location, _discoro_dest_path))
    # Bind every state name used below (including the names of the nested
    # functions defined further down) before the snapshots are taken.
    _discoro_req = _discoro_client = _discoro_auth = _discoro_msg = None
    _discoro_timer_coro = _discoro_pulse_coro = _discoro_timer_proc = _discoro_peer_status = None
    _discoro_monitor_coro = _discoro_monitor_proc = _discoro_node_status = None
    _discoro_computation = _discoro_func = _discoro_var = None
    _discoro_job_coros = set()
    _discoro_busy_time = time.time()
    # Snapshot of the module globals as they are before any computation code
    # is exec'ed; 'close' uses it to drop names added by a computation and to
    # restore the original values. (_discoro_locals is filled here but not
    # read again in this function.)
    _discoro_globals = {}
    _discoro_locals = {}
    _discoro_globals.update(globals())
    _discoro_locals.update(locals())

    def _discoro_timer_proc(coro=None):
        # Daemon coroutine: while a computation is set up, periodically sends
        # a pulse message to the scheduler's pulse coroutine.
        coro.set_daemon()
        last_pulse = time.time()
        interval = None
        while True:
            # a truthy value here is what was passed to resume() by the
            # 'setup' / 'close' handlers, i.e., the new interval
            reset = yield coro.sleep(interval)
            if reset:
                if not isinstance(_discoro_pulse_coro, Coro):
                    # no scheduler to pulse: sleep without timeout until resumed
                    interval = None
                    continue
                interval = reset
                last_pulse = time.time()
                continue
            if not _discoro_pulse_coro:
                continue
            msg = {'ncoros': len(_discoro_job_coros), 'location': coro.location}
            if _discoro_node_status:
                msg['node_status'] = DiscoroNodeStatus(coro.location.addr, psutil.cpu_percent(),
                                                       psutil.virtual_memory().percent,
                                                       psutil.disk_usage(_discoro_dest_path).percent)

            # a return value of 0 from send() is treated as a delivered pulse
            if _discoro_pulse_coro.send(msg) == 0:
                last_pulse = time.time()
            elif (time.time() - last_pulse) > (5 * interval) and _discoro_computation:
                # more than 5 intervals without a delivered pulse: ask the
                # main loop to close the computation
                asyncoro.logger.warning('scheduler is not reachable; closing computation "%s"' %
                                        _discoro_computation._auth)
                _discoro_coro.send({'req': 'close', 'auth': _discoro_computation._auth})

            # NOTE(review): _discoro_computation is dereferenced here without a
            # None check (unlike the branch above) - confirm it cannot be None
            # while _discoro_pulse_coro is set.
            if ((not _discoro_job_coros) and _discoro_computation.zombie_period and
               ((time.time() - _discoro_busy_time) > _discoro_computation.zombie_period)):
                asyncoro.logger.debug('%s: zombie computation "%s"' %
                                      (coro.location, _discoro_computation._auth))
                # TODO: close? For now wait for "too many" timeouts to close

    def _discoro_peer_status(coro=None):
        # Daemon coroutine registered as peer status handler: when the peer
        # hosting the current pulse coroutine goes offline, ask the main loop
        # to close the computation.
        coro.set_daemon()
        while True:
            status = yield coro.receive()
            if isinstance(status, asyncoro.PeerStatus) and \
               status.status == asyncoro.PeerStatus.Offline and \
               _discoro_pulse_coro and _discoro_pulse_coro.location == status.location:
                asyncoro.logger.debug('scheduler at %s quit; closing computation %s' %
                                      (status.location, _discoro_computation._auth))
                msg = {'req': 'close', 'auth': _discoro_computation._auth}
                _discoro_coro.send(msg)

    def _discoro_monitor_proc(coro=None):
        # Daemon coroutine notified by job coroutines (see 'run'): on a
        # MonitorException the job (msg.args[0]) is dropped from the set of
        # running jobs and the busy timestamp is refreshed.
        nonlocal _discoro_busy_time
        coro.set_daemon()
        while True:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                _discoro_busy_time = time.time()
                asyncoro.logger.debug('job %s done' % msg.args[0])
                _discoro_job_coros.discard(msg.args[0])
            else:
                asyncoro.logger.warning('%s: invalid monitor message ignored' % coro.location)

    _discoro_timer_coro = Coro(_discoro_timer_proc)
    _discoro_monitor_coro = Coro(_discoro_monitor_proc)
    asyncoro.AsynCoro.instance().peer_status(Coro(_discoro_peer_status))

    # main request loop
    while True:
        _discoro_msg = yield _discoro_coro.receive()
        if not isinstance(_discoro_msg, dict):
            continue
        _discoro_req = _discoro_msg.get('req', None)

        if _discoro_req == 'run':
            # create a job coroutine for the current computation
            _discoro_client = _discoro_msg.get('client', None)
            _discoro_auth = _discoro_msg.get('auth', None)
            _discoro_func = _discoro_msg.get('func', None)
            # requires a Coro client, an active computation and matching auth;
            # a Coro client of an invalid request is sent None
            if not isinstance(_discoro_client, Coro) or not _discoro_computation or \
               _discoro_auth != _discoro_computation._auth:
                asyncoro.logger.warning('invalid run: %s' % (type(_discoro_func)))
                if isinstance(_discoro_client, Coro):
                    _discoro_client.send(None)
                continue
            try:
                _discoro_func = asyncoro.unserialize(_discoro_func)
                # NOTE: code received in the message is executed in the module
                # globals; the auth check above is the only gate visible here.
                if _discoro_func.code:
                    exec(_discoro_func.code, globals())
                job_coro = Coro(globals()[_discoro_func.name],
                                *(_discoro_func.args), **(_discoro_func.kwargs))
            except:
                asyncoro.logger.debug('invalid computation to run')
                # _discoro_func = Scheduler._Function(_discoro_func.name, None,
                #                                     _discoro_func.args, _discoro_func.kwargs)
                # on failure the client gets a tuple (exception type, function
                # name, traceback text) instead of a coroutine
                job_coro = (sys.exc_info()[0], getattr(_discoro_func, 'name', _discoro_func),
                            traceback.format_exc())
            else:
                asyncoro.logger.debug('job %s created' % job_coro)
                _discoro_job_coros.add(job_coro)
                # the monitor coroutine (and the optional 'notify' coroutine
                # from the request) is notified by the job
                job_coro.notify(_discoro_monitor_coro)
                _discoro_var = _discoro_msg.get('notify', None)
                if isinstance(_discoro_var, Coro):
                    job_coro.notify(_discoro_var)
            _discoro_busy_time = time.time()
            _discoro_client.send(job_coro)
            del job_coro
        elif _discoro_req == 'setup':
            # install a computation; the client is sent 0 on success, -1 on failure
            _discoro_client = _discoro_msg.get('client', None)
            # NOTE(review): the pulse coroutine is replaced here, before the
            # "busy" check below, so a rejected 'setup' still overwrites the
            # pulse coroutine of the active computation - confirm intended.
            _discoro_pulse_coro = _discoro_msg.get('pulse_coro', None)
            if not isinstance(_discoro_client, Coro) or not isinstance(_discoro_pulse_coro, Coro):
                continue
            if _discoro_computation is not None:
                asyncoro.logger.debug('invalid "setup" - busy')
                _discoro_client.send(-1)
                continue
            os.chdir(_discoro_dest_path)
            try:
                _discoro_computation = _discoro_msg['computation']
                # make 'asyncoro' available to computation code in the module
                # globals (and in '__mp_main__' when running under it)
                exec('import asyncoro.disasyncoro as asyncoro', globals())
                if __name__ == '__mp_main__':  # Windows multiprocessing process
                    exec('import asyncoro.disasyncoro as asyncoro',
                         sys.modules['__mp_main__'].__dict__)
                # NOTE: code supplied with the computation is executed as is
                if _discoro_computation._code:
                    exec(_discoro_computation._code, globals())
                    if __name__ == '__mp_main__':  # Windows multiprocessing process
                        exec(_discoro_computation._code, sys.modules['__mp_main__'].__dict__)
            except:
                _discoro_computation = None
                asyncoro.logger.warning('invalid computation')
                asyncoro.logger.debug(traceback.format_exc())
                _discoro_client.send(-1)
                continue
            if psutil and _discoro_msg.get('node_status', None):
                _discoro_node_status = True
            # an int pulse interval within [MinPulseInterval, MaxPulseInterval]
            # is kept (the self-assignment is a no-op); anything else is
            # replaced with MinPulseInterval
            if isinstance(_discoro_computation.pulse_interval, int) and \
               MinPulseInterval <= _discoro_computation.pulse_interval <= MaxPulseInterval:
                _discoro_computation.pulse_interval = _discoro_computation.pulse_interval
            else:
                _discoro_computation.pulse_interval = MinPulseInterval
            # start pulsing at the computation's interval
            _discoro_timer_coro.resume(_discoro_computation.pulse_interval)
            _discoro_busy_time = time.time()
            asyncoro.logger.debug('computation "%s" from %s' %
                                  (_discoro_computation._auth, _discoro_msg['client'].location))
            _discoro_client.send(0)
        elif _discoro_req == 'close':
            # accepted with either the computation's auth or the server's own
            # (config) auth
            _discoro_auth = _discoro_msg.get('auth', None)
            if not _discoro_computation or (_discoro_auth != _discoro_computation._auth and
                                            _discoro_auth != _discoro_config['auth']):
                continue
            asyncoro.logger.debug('%s deleting computation "%s"' %
                                  (_discoro_coro.location, _discoro_computation._auth))
            # closed with the server's auth rather than the computation's:
            # tell the scheduler that this server is closed
            if _discoro_auth != _discoro_computation._auth and _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerClosed',
                                          'location': _discoro_coro.location})
            for _discoro_var in _discoro_job_coros:
                _discoro_var.terminate()
            _discoro_job_coros = set()

            # drop every global that was not in the start-up snapshot and
            # restore the snapshot's values
            if __name__ == '__mp_main__':  # Windows multiprocessing process
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                        sys.modules['__mp_main__'].__dict__.pop(_discoro_var, None)
                globals().update(_discoro_globals)
                sys.modules['__mp_main__'].__dict__.update(_discoro_globals)
            else:
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                globals().update(_discoro_globals)

            # empty the working directory
            for _discoro_var in os.listdir(_discoro_dest_path):
                _discoro_var = os.path.join(_discoro_dest_path, _discoro_var)
                if os.path.isdir(_discoro_var) and not os.path.islink(_discoro_var):
                    shutil.rmtree(_discoro_var, ignore_errors=True)
                else:
                    os.remove(_discoro_var)
            # re-create the working directory if it is no longer a directory
            if not os.path.isdir(_discoro_dest_path):
                try:
                    os.remove(_discoro_dest_path)
                except:
                    pass
                os.makedirs(_discoro_dest_path)
            # re-create the PID file if it is not a regular file any more
            if not os.path.isfile(_discoro_pid_path):
                try:
                    if os.path.islink(_discoro_pid_path):
                        os.remove(_discoro_pid_path)
                    else:
                        shutil.rmtree(_discoro_pid_path)
                    with open(_discoro_pid_path, 'w') as _discoro_var:
                        _discoro_var.write('%s' % os.getpid())
                except:
                    asyncoro.logger.warning('PID file "%s" is invalid' % _discoro_pid_path)
            os.chdir(_discoro_dest_path)
            asyncoro.AsynCoro.instance().dest_path = _discoro_dest_path
            _discoro_computation = _discoro_client = _discoro_pulse_coro = None
            _discoro_node_status = None
            # a positive 'serve' counts down once per closed computation; the
            # server leaves the main loop when it reaches 0
            if _discoro_config['serve'] > 0:
                _discoro_config['serve'] -= 1
                if _discoro_config['serve'] == 0:
                    break
            # wake the timer; with no pulse coroutine set it goes back to
            # sleeping without a timeout
            _discoro_timer_coro.resume(MinPulseInterval)
        elif _discoro_req == 'node_info':
            # reply with DiscoroNodeInfo; placeholder values without psutil
            if psutil:
                info = DiscoroNodeInfo(
                    _discoro_name, _discoro_coro.location.addr,
                    psutil.cpu_count(), psutil.cpu_percent(),
                    {_discoro_var: getattr(psutil.virtual_memory(), _discoro_var)
                     for _discoro_var in ['total', 'percent']},
                    {_discoro_var: getattr(psutil.disk_usage(_discoro_dest_path), _discoro_var)
                     for _discoro_var in ['total', 'percent']}
                    )
                if _discoro_msg.get('node_status', None):
                    _discoro_node_status = True
            else:
                info = DiscoroNodeInfo(_discoro_name, _discoro_coro.location.addr,
                                       -1, -1, None, None)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(info)
        elif _discoro_req == 'status':
            # requires the server's own auth; prints a one-line summary
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring info: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                print('  Server %s running %d coroutines for computation at %s' %
                      (_discoro_coro.location, len(_discoro_job_coros),
                       _discoro_pulse_coro.location))
            else:
                print('  Server %s not used by any computation' % (_discoro_coro.location))
        elif _discoro_req == 'quit':
            # requires the server's own auth; leaves the main loop, running
            # jobs are waited for in the loop below
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring quit: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerClosed',
                                          'location': _discoro_coro.location})
            break
        elif _discoro_req == 'terminate':
            # requires the server's own auth
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring terminate: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerTerminated',
                                          'location': _discoro_coro.location})
            if _discoro_computation:
                # send ourselves a 'close' with 'serve' forced to 1, so the
                # 'close' handler above counts it down to 0 and breaks
                msg = {'req': 'close', 'auth': _discoro_computation._auth}
                _discoro_config['serve'] = 1
                _discoro_coro.send(msg)
            else:
                break
        else:
            # unknown request: a Coro client is sent -1
            asyncoro.logger.warning('invalid command "%s" ignored' % _discoro_req)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(-1)

    # wait until all computations are done; process only 'close'
    while _discoro_job_coros:
        _discoro_msg = yield _discoro_coro.receive()
        if not isinstance(_discoro_msg, dict):
            continue
        _discoro_req = _discoro_msg.get('req', None)

        if _discoro_req == 'close':
            # here only the computation's own auth is accepted
            _discoro_auth = _discoro_msg.get('auth', None)
            if not _discoro_computation or _discoro_auth != _discoro_computation._auth:
                continue
            asyncoro.logger.debug('%s deleting computation "%s"' %
                                  (_discoro_coro.location, _discoro_computation._auth))

            # same globals restoration as in the main loop's 'close' handler
            if __name__ == '__mp_main__':  # Windows multiprocessing process
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                        sys.modules['__mp_main__'].__dict__.pop(_discoro_var, None)
                globals().update(_discoro_globals)
                sys.modules['__mp_main__'].__dict__.update(_discoro_globals)
            else:
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                globals().update(_discoro_globals)

            break
        else:
            asyncoro.logger.warning('invalid command "%s" ignored' % _discoro_req)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(-1)

    # final cleanup: empty the working directory, remove the PID file and
    # report on the configured queue by putting the server's auth on it
    for _discoro_var in os.listdir(_discoro_dest_path):
        _discoro_var = os.path.join(_discoro_dest_path, _discoro_var)
        if os.path.isdir(_discoro_var) and not os.path.islink(_discoro_var):
            shutil.rmtree(_discoro_var, ignore_errors=True)
        else:
            os.remove(_discoro_var)
    if os.path.isfile(_discoro_pid_path):
        os.remove(_discoro_pid_path)
    _discoro_config['mp_queue'].put(_discoro_config['auth'])
    asyncoro.logger.debug('discoro server %s quit' % _discoro_coro.location)
Esempio n. 18
0
class Peer_Remote():  # outbound connections
    """Outbound TCP connection to a remote peer, driven by coroutines.

    Creating an instance starts a coroutine that connects to
    (remote_ip, remote_port). Once connected, a sender coroutine is started
    and 'network_service.on_server_connect(self, context)' is called; 'send'
    and 'stop' rely on that sender coroutine and so must only be used after
    that callback.

    Events are reported to 'network_service' through 'on_server_connect',
    'on_client_data_sent', 'on_client_disconnected' and (receive loop only)
    'on_peer_data_received'.
    """

    def __init__(self, network_service, remote_ip, remote_port, context=None):
        """Remember the connection parameters and start connecting.

        'context' is an opaque value handed back to 'on_server_connect'.
        """
        self.exit = False
        self.network_service = network_service
        self.remote_ip = remote_ip
        self.remote_port = remote_port
        self.context = context

        Coro(self._server_connect)

    def _server_connect(self, coro=None):
        """Coroutine: open the socket, connect, then start the sender."""
        try:
            #logger.debug('CLIENT: connecting to peer at %s:%s', self.remote_ip, str(self.remote_port))
            self.outbound_socket = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_STREAM))
            # disable Nagle's algorithm, as messages are sent UDP style
            self.outbound_socket.setsockopt(socket.SOL_TCP, socket.TCP_NODELAY, 1)

            yield self.outbound_socket.connect((self.remote_ip, self.remote_port))
            #logger.debug('CLIENT: connected to peer at %s:%s', self.remote_ip, str(self.remote_port))
        except Exception:
            # 'Exception' (not a bare except) so that GeneratorExit thrown
            # into the coroutine is not mistaken for a connection error
            show_error()
            # the connection never came up, so nothing else will ever close
            # this socket; release it here
            failed_socket = getattr(self, 'outbound_socket', None)
            if failed_socket is not None:
                try:
                    failed_socket.close()
                except Exception:
                    show_error()
        else:
            try:
                self._send_coro = Coro(self._client_send)
                # a receive loop (_client_recv) is not started: it is unneeded
                # as long as bi-directional communication is not used

                self.network_service.on_server_connect(self, self.context)
            except Exception:
                show_error()

    def _client_recv(self, coro=None):
        """Coroutine: hand received messages to the network service.

        Ends when an empty message (or None) is received or 'exit' is set.
        """
        while True:
            try:
                data = yield self.outbound_socket.recv_msg()
                if not data or self.exit:
                    break
                #logger.debug('CLIENT: received data to peer at %s:%s (Data: %s)', self.remote_ip, str(self.remote_port), data)
                self.network_service.on_peer_data_received(data)
            except Exception:
                # errors are reported and the loop goes on; GeneratorExit is
                # deliberately not caught so the coroutine can be terminated
                show_error()

    def _client_send(self, coro=None):
        """Daemon coroutine: send queued messages until told to disconnect.

        Each item received is '(cmd, (data, context))'. NETWORK_PEER_DISCONNECT
        as 'cmd' reports 'on_client_disconnected(context)' and ends the loop;
        anything else sends 'data' and reports 'on_client_data_sent(context)'.
        The socket is shut down and closed when the loop ends.
        """
        coro.set_daemon()
        while True:
            try:
                cmd, state = yield coro.receive()
                data, context = state
                if cmd == NETWORK_PEER_DISCONNECT:
                    self.network_service.on_client_disconnected(context)
                    break

                #logger.debug('CLIENT: sending data to %s:%s (Data is: %s)', self.remote_ip, self.remote_port,data)
                yield self.outbound_socket.send_msg(data)
                self.network_service.on_client_data_sent(context)
            except Exception:
                # a failed send is reported and the next message is awaited;
                # GeneratorExit is deliberately not caught here, otherwise a
                # terminated coroutine would keep looping
                show_error()

        try:
            self.outbound_socket.shutdown(socket.SHUT_RDWR)
        finally:
            # close even if shutdown fails (e.g., peer already gone)
            self.outbound_socket.close()
        #logger.debug('CLIENT: disconnected from %s:%s', self.remote_ip, str(self.remote_port))

    def send(self, data, context):
        """Queue 'data' for sending; ignored once 'stop' has been called.

        'context' is handed to 'on_client_data_sent' after the data is sent.
        """
        if not self.exit:
            self._send_coro.send((None, (data, context)))

    def stop(self, context=None):
        """Mark the peer as stopped and ask the sender to disconnect.

        'context' is handed to 'on_client_disconnected'.
        """
        self.exit = True
        #logger.debug('CLIENT: disconnecting from %s:%s', self.remote_ip, str(self.remote_port))
        self._send_coro.send((NETWORK_PEER_DISCONNECT, (None,context)))
Esempio n. 19
0
    def __init__(self, computation, proc_status=None, proc_available=None, proc_close=None):
        """Create a scheduler for 'computation', a discoro.Computation instance.

        The three optional hooks must be generator functions, since each is
        run as a coroutine; a hook that is not a generator function is dropped
        after a warning is logged.

        'proc_status' is called with the status and info, as received by
        status_coro. When the status is ServerInitialized and the hook returns
        a non-zero value, that server is ignored; i.e., jobs scheduled with
        'schedule' or 'execute' will not use it.

        'proc_available' is called with the location of a server process once
        it becomes available (after all 'depends' of the computation have been
        transferred). It runs at the client and may create remote coroutine(s)
        at the server process, e.g., to initialize global variables or to
        transfer additional files. It should exit with 0 to indicate
        successful setup; any other value is interpreted as failure and the
        server is not used by the scheduler.

        'proc_close' is called with two parameters, 'status' and 'location' of
        the server process, when a server is about to be closed or is already
        closed. It runs at the client and may create remote coroutine(s) at
        the server process to clean up, e.g., delete global variables or
        transfer files back to the client. 'status' is either
        'discoro.Scheduler.ServerInitialized' when the server is about to be
        closed (the server is still available and remote coroutines can be
        executed), or 'discoro.Scheduler.ServerClosed' when it is already
        closed (e.g., zombie_period elapsed without communication, or the
        server was closed manually from the command line).
        """

        def accept(hook, complaint):
            # a truthy hook that is not a generator function is replaced with
            # None; anything else is passed through untouched
            if hook and not inspect.isgeneratorfunction(hook):
                asyncoro.logger.warning(complaint)
                return None
            return hook

        proc_status = accept(proc_status, 'Invalid proc_status ignored')
        proc_available = accept(proc_available, 'Invalid proc_available ignored')
        proc_close = accept(proc_close, 'Invalid proc_close ignored')

        self._proc_status = proc_status
        self._proc_available = proc_available
        self._proc_close = proc_close
        self._close_servers = dict()

        self.computation = computation
        self.computation_sign = None
        self.status_coro = Coro(self._status_proc)
        # the computation keeps a status coroutine of its own if it has one
        if not computation.status_coro:
            computation.status_coro = self.status_coro
        self._rcoros = dict()
        self._rcoros_done = asyncoro.Event()
        self._askew_results = dict()
        self._servers = dict()
        self._server_avail = asyncoro.Event()
        Coro(computation.schedule)
Esempio n. 20
0
    def __init__(self, computation, status=None, node_available=None,
                 proc_available=None, proc_close=None):
        """Create a scheduler for 'computation', a discoro.Computation instance.

        The optional hooks must be generator functions, since each is run as a
        coroutine; a hook that is not a generator function is dropped after a
        warning is logged.

        'status' is called with the status and info, as received by
        status_coro. When the status is ServerInitialized and the hook returns
        a non-zero value, that server is ignored; i.e., jobs scheduled with
        'schedule' or 'execute' will not use it.

        'node_available', when not given, is taken from
        'computation._node_available' if that is set.

        'proc_available' is called with the location of a server process once
        it becomes available (after all 'depends' of the computation have been
        transferred). It runs at the client and may create remote coroutine(s)
        at the server process, e.g., to initialize global variables or to
        transfer additional files. It should exit with 0 to indicate
        successful setup; any other value is interpreted as failure and the
        server is not used by the scheduler.

        'proc_close' is called with two parameters, 'status' and 'location' of
        the server process, when a server is about to be closed or is already
        closed. It runs at the client and may create remote coroutine(s) at
        the server process to clean up, e.g., delete global variables or
        transfer files back to the client. 'status' is either
        'discoro.Scheduler.ServerInitialized' when the server is about to be
        closed (the server is still available and remote coroutines can be
        executed), or 'discoro.Scheduler.ServerClosed' when it is already
        closed (e.g., zombie_period elapsed without communication, or the
        server was closed manually from the command line).
        """

        def accept(hook, complaint):
            # a truthy hook that is not a generator function is replaced with
            # None; anything else is passed through untouched
            if hook and not inspect.isgeneratorfunction(hook):
                asyncoro.logger.warning(complaint)
                return None
            return hook

        status = accept(status, 'Invalid status ignored')
        proc_available = accept(proc_available, 'Invalid proc_available ignored')
        proc_close = accept(proc_close, 'Invalid proc_close ignored')
        # fall back to the hook stored with the computation, then validate
        # whichever one is in effect
        if not node_available and computation._node_available:
            node_available = computation._node_available
        node_available = accept(node_available, 'Invalid node_available ignored')

        self._status = status
        self._proc_available = proc_available
        self._proc_close = proc_close
        self._node_available = node_available
        self._close_servers = dict()

        self.computation = computation
        self.computation_sign = None
        self.status_coro = Coro(self._status_proc)
        if not isinstance(computation.status_coro, Coro):
            computation.status_coro = self.status_coro
        else:
            # the computation already has a status coroutine: put a daemon
            # relay in its place that passes every message on to that
            # coroutine first and then to this scheduler's status coroutine
            def chain_status_msgs(status_coro, client, coro=None):
                coro.set_daemon()
                while True:
                    update = yield coro.receive()
                    client.send(update)
                    status_coro.send(update)

            previous_status_coro = computation.status_coro
            computation.status_coro = Coro(chain_status_msgs, self.status_coro,
                                           previous_status_coro)
        self._rcoros = dict()
        self._rcoros_done = asyncoro.Event()
        self._askew_results = dict()
        self._servers = dict()
        self._server_avail = asyncoro.Event()
        self._remote_scheduler = False
        self.asyncoro = asyncoro.AsynCoro()
        Coro(computation.schedule)
Esempio n. 21
0
class Node_Service(Service):
    """Service that owns the local nodes and drives their periodic maintenance.

    Maintenance commands ("NODE_STABILIZE", "NODE_CHECK_PREDECESSOR",
    "NODE_FIX_FINGERS") are kept in ``delay_queue`` as
    ``(queued_at, delay_ms, message)`` tuples.  A daemon thread
    (``_thread_scheduler``) forwards every entry whose delay has elapsed to
    the ``_coro_stabilize`` coroutine, which runs the matching node method
    on a thread pool and queues the next command of the cycle.

    ``handle_message`` dispatches messages routed to this service: node
    setup, server start callbacks, forwards, data received from peers and
    console commands.
    """
    exit = False
    pause_scheduler = False

    def __init__(self, message_router):
        """Register the service and start the scheduler thread and coroutine.

        message_router -- router handed to the ``Service`` base class; also
            used by ``handle_message`` to route follow-up messages.
        """
        super(Node_Service, self).__init__(SERVICE_NODE, message_router, cl_name="ns")
        self.exit = False
        self.pause_scheduler = False
        self.nodes = {}
        self.queue = deque()
        self.delay_queue = deque()

        # Create the coroutine before starting the thread that sends to it,
        # so the thread can never observe a missing '_stabilize_coro'.
        self._stabilize_coro = Coro(self._coro_stabilize)

        self.scheduler_thread = Thread(target=self._thread_scheduler)
        self.scheduler_thread.daemon = True
        self.scheduler_thread.start()

    def _enqueue_delayed(self, message, ms):
        """Queue 'message' for delivery after 'ms' milliseconds, even if paused."""
        self.delay_queue.append((time.time(), ms, message))

    def delay_enqueue(self, message, ms):
        """Queue 'message' for delivery to the stabilize coroutine after 'ms'
        milliseconds.  The request is dropped while the scheduler is paused.
        """
        if not self.pause_scheduler:
            self._enqueue_delayed(message, ms)

    def _thread_scheduler(self):
        """Thread body: deliver due entries of 'delay_queue' until 'exit' is set."""
        try:
            while not self.exit:
                requeue = deque()

                while len(self.delay_queue) > 0 and not self.exit:
                    queued_at, delay_ms, message = self.delay_queue.popleft()
                    # Divide by a float: under Python 2 'delay_ms / 1000'
                    # is integer division and truncates sub-second delays.
                    if (queued_at + delay_ms / 1000.0) < time.time():
                        self._stabilize_coro.send(message)
                    else:
                        requeue.append((queued_at, delay_ms, message))

                # Put back everything that is not due yet.
                while len(requeue) > 0 and not self.exit:
                    self.delay_queue.append(requeue.popleft())
                time.sleep(.1)  # 100 ms
        except Exception:
            show_error()
        print("Scheduler thread exiting")

    def _coro_stabilize(self, coro=None):
        """Coroutine body: receive ``(command, context)`` messages and run the
        corresponding maintenance step of 'context' on a thread pool.

        Each handled command queues the next one, so a node cycles through
        stabilize -> check predecessor -> fix fingers every
        MAINTENANCE_PERIOD.
        """
        coro.set_daemon()

        thread_pool = AsynCoroThreadPool(2 * multiprocessing.cpu_count())
        while not self.exit:
            try:
                # 'coro' is this coroutine itself; using it avoids relying on
                # 'self._stabilize_coro' having been assigned already.
                command, context = yield coro.receive()

                if self.exit:  # fast exit for now (non-graceful)
                    break

                if self.pause_scheduler or context.join_on_stabilize:  # just cycle the messages until unpaused
                    if context.join_on_stabilize:
                        context.send_message(Find_Successor_Message(context.thisNode, context.thisNode.key, context.thisNode), context.join_on_stabilize)
                        context.join_on_stabilize = None
                        delay = MAINTENANCE_PERIOD * 3
                    else:
                        delay = MAINTENANCE_PERIOD

                    # Bypass the pause check of delay_enqueue(): it would
                    # drop the message and the maintenance cycle of this
                    # node would never restart after "resume".
                    self._enqueue_delayed((command, context), delay)
                    continue

                if command == "NODE_STABILIZE":
                    yield thread_pool.async_task(coro, context.begin_stabilize)
                    self.delay_enqueue(("NODE_CHECK_PREDECESSOR", context), MAINTENANCE_PERIOD)
                elif command == "NODE_CHECK_PREDECESSOR":
                    yield thread_pool.async_task(coro, context.begin_stabilize)
                    yield thread_pool.async_task(coro, context.check_predecessor)
                    self.delay_enqueue(("NODE_FIX_FINGERS", context), MAINTENANCE_PERIOD)
                elif command == "NODE_FIX_FINGERS":
                    yield thread_pool.async_task(coro, context.fix_fingers, 10)
                    self.delay_enqueue(("NODE_STABILIZE", context), MAINTENANCE_PERIOD)
            except Exception:
                # 'Exception' (not a bare except) so that GeneratorExit,
                # raised when the coroutine is closed, is not swallowed.
                show_error()

        print("Coro(stabilize) exiting")

    def stop(self, context=None):
        """Ask the scheduler thread and the stabilize coroutine to exit."""
        self.exit = True
        # Wake the coroutine so it notices 'exit'.
        self._stabilize_coro.send((None, None))

    def get_console_node(self):
        """Return one of the registered nodes, or None if there are none."""
        for node in self.nodes.values():
            return node

    def check_scheduler_pause(self, msg):
        """Return True if the scheduler is paused and 'msg' (unwrapped from a
        peer-data envelope if needed) is one of the maintenance message types.
        """
        if not self.pause_scheduler:
            return False

        if msg.type == Message_Recv_Peer_Data.Type():
            msg = msg.network_msg

        return msg.type in (Stabilize_Reply_Message.Type(),
                            Stablize_Message.Type(),
                            Check_Predecessor_Message.Type(),
                            Update_Message.Type(),
                            Find_Successor_Message.Type())

    def handle_message(self, msg):
        """Dispatch a message addressed to this service.

        Raises Exception if 'msg' is addressed to another service, or if a
        server could not be started for a node.
        """
        if not msg.dest_service == self.service_id:
            raise Exception("Mismatched service recipient for message.")

        if msg.type == Message_Setup_Node.Type():
            node = Node(self, msg.public_ip, msg.local_ip, msg.local_port)
            if msg.seeded_peers:
                for peer in msg.seeded_peers:
                    node.join(peer) # use callback to try next peer if join fails (join is async)
            else:
                node.join()

            self.message_router.route(
                Message_Start_Server(node.local_ip, node.local_port,
                        Message_Start_Server_Callback(self.service_id, node, True),
                        Message_Start_Server_Callback(self.service_id, node, False)))

        elif msg.type == Message_Start_Server_Callback.Type():
            if msg.result:
                self.nodes[str(msg.node)] = msg.node
                self.delay_enqueue(("NODE_STABILIZE", msg.node), 1000)
            else:
                msg.node.exit_network()
                raise Exception("Unable to successfully start server for node at " + str(msg.node.thisNode))
        elif msg.type == Message_Forward.Type():
            ni = msg.origin_node
            if str(ni) in self.nodes:
                lnode = self.nodes[str(ni)] # get the Node class this message is addressed to (ip:port)
                forward_node = lnode.find_ideal_forward(msg.forward_hash)

                if msg.forward_msg.type == Database_Get_Message.Type() or msg.forward_msg.type == Database_Put_Message.Type():
                    msg.forward_msg.storage_node = forward_node

                if forward_node != ni:
                    self.send_message(msg.forward_msg, forward_node)
                else:
                    self.send_message(msg.forward_msg)
        elif msg.type == Message_Recv_Peer_Data.Type(): # came off the wire
            ni = Node_Info(msg.local_ip or "127.0.0.1", msg.local_port)
            if str(ni) in self.nodes:
                lnode = self.nodes[str(ni)] # get the Node class this message is addressed to (ip:port)

                msg = msg.network_msg
                rnode = lnode.final_destination(msg)
                if rnode != lnode.thisNode:
                    # Not for this node: pass it on.
                    self.send_message(msg, rnode)
                    return

                logger.debug(str(ni) + " received network msg: " + msg.type)

                if msg.type == Message_Forward.Type():
                    lnode.find_ideal_forward(msg.forward_hash) #fix this...we need to do forward the message
                elif msg.type == Find_Successor_Message.Type():
                    self.send_message(Update_Message(lnode.thisNode, msg.reply_to.key, msg.finger), msg.reply_to)
                elif msg.type == Update_Message.Type():
                    lnode.update_finger(msg.reply_to, msg.finger)
                elif msg.type == Check_Predecessor_Message.Type():
                    self.send_message(Update_Message(lnode.thisNode, msg.reply_to.key, 0), msg.reply_to)
                elif msg.type == Stablize_Message.Type():
                    self.send_message(Stabilize_Reply_Message(lnode.thisNode, msg.reply_to.key, msg.reply_to))
                elif msg.type == Stabilize_Reply_Message.Type():
                    lnode.stabilize(msg)
                elif msg.type == Notify_Message.Type():
                    lnode.get_notified(msg)
                elif msg.type == Exit_Message.Type():
                    lnode.peer_polite_exit(msg.reply_to)
                elif msg.type == Database_Put_Message.Type() or msg.type == Database_Get_Message.Type():
                    msg.storage_node = lnode.thisNode
                    self.message_router.route(msg)
                elif msg.type == Database_Put_Message_Response.Type() or msg.type == Database_Get_Message_Response.Type():
                    self.message_router.route(msg)

        elif msg.type == Message_Console_Command.Type():
            self.handle_command(msg.command, msg.args)

    # wraps message in network packet if not destined for local
    #def route_node_message(self,msg):
    #    if self.nodes.has_key(str(msg.origin_node)):
    #        node = self.nodes[str(msg.origin_node)]
    #        forward_to = node.find_ideal_forward(msg.destination_key)
    #        if forward_to == node:
    #            self.send_message(msg)
    #        else:
    #            self.send_message(Message_Send_Peer_Data(forward_to, msg.serialize()))

    def handle_command(self, cmd, args=None):
        """Execute a console command: "print", "pause" or "resume".

        'args' is accepted for interface compatibility; it is not used.
        """
        if cmd == "print":
            for node in self.nodes.values():
                # Single-argument print() behaves the same as the print
                # statement under Python 2 and is valid Python 3.
                print("successor   %s" % (node.thisNode.successor.print_key(),))
                print("predecessor %s" % (node.thisNode.predecessor.print_key(),))
        elif cmd == "pause":
            self.pause_scheduler = True
        elif cmd == "resume":
            self.pause_scheduler = False
Esempio n. 22
0
class _DispyNode(object):
    """Internal use only.
    """
    def __init__(self, cpus, ip_addr=None, ext_ip_addr=None, node_port=None,
                 scheduler_node=None, scheduler_port=None,
                 dest_path_prefix='', secret='', keyfile=None, certfile=None,
                 max_file_size=None, zombie_interval=60):
        """Set up node state, the TCP and UDP sockets, the reply-queue
        thread and the timer / UDP coroutines.

        cpus -- number of cpus to serve; must be between 1 and
            multiprocessing.cpu_count().
        ip_addr, ext_ip_addr -- addresses resolved with _node_ipaddr();
            ip_addr defaults to the address of the local host name and
            ext_ip_addr defaults to ip_addr.
        node_port, scheduler_port -- default to 51348 and 51347.
        scheduler_node -- resolved with _node_ipaddr() and handed to the
            UDP server coroutine.
        dest_path_prefix -- directory for computation files; defaults to
            <os.sep>tmp<os.sep>dispy and is created (owner-only access)
            if it does not exist.
        secret -- appended to the random signature to derive auth_code.
        keyfile, certfile -- made absolute when given; if certfile is set
            the TCP socket is wrapped with SSL.
        max_file_size -- defaults to MaxFileSize when None.
        zombie_interval -- stored multiplied by 60 (presumably minutes
            converted to seconds -- confirm against timer_task).

        Raises Exception if ip_addr or ext_ip_addr cannot be resolved.
        """
        assert 0 < cpus <= multiprocessing.cpu_count()
        self.cpus = cpus

        # Work out the address to bind to and the address to advertise.
        if not ip_addr:
            self.name = socket.gethostname()
            ip_addr = socket.gethostbyname(self.name)
        else:
            ip_addr = _node_ipaddr(ip_addr)
            if not ip_addr:
                raise Exception('invalid ip_addr')
        if not ext_ip_addr:
            ext_ip_addr = ip_addr
        else:
            ext_ip_addr = _node_ipaddr(ext_ip_addr)
            if not ext_ip_addr:
                raise Exception('invalid ext_ip_addr')
        try:
            self.name = socket.gethostbyaddr(ext_ip_addr)[0]
        except:
            self.name = socket.gethostname()
        node_port = node_port or 51348
        scheduler_port = scheduler_port or 51347

        self.ip_addr = ip_addr
        self.ext_ip_addr = ext_ip_addr
        self.scheduler_port = scheduler_port
        self.pulse_interval = None
        self.keyfile = os.path.abspath(keyfile) if keyfile else keyfile
        self.certfile = os.path.abspath(certfile) if certfile else certfile

        self.asyncoro = AsynCoro()

        # TCP socket on which job / computation requests are accepted.
        self.tcp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        if self.certfile:
            self.tcp_sock = ssl.wrap_socket(self.tcp_sock, keyfile=self.keyfile,
                                            certfile=self.certfile)
        self.tcp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.tcp_sock.bind((self.ip_addr, node_port))
        self.address = self.tcp_sock.getsockname()
        self.tcp_sock.listen(30)

        # Directory under which each computation gets its own dest_path.
        self.dest_path_prefix = (dest_path_prefix.strip().rstrip(os.sep)
                                 if dest_path_prefix
                                 else os.path.join(os.sep, 'tmp', 'dispy'))
        if not os.path.isdir(self.dest_path_prefix):
            os.makedirs(self.dest_path_prefix)
            # S_IRWXU is S_IRUSR | S_IWUSR | S_IXUSR
            os.chmod(self.dest_path_prefix, stat.S_IRWXU)
        self.max_file_size = MaxFileSize if max_file_size is None else max_file_size

        # Bookkeeping for computations and jobs.
        self.avail_cpus = self.cpus
        self.computations = {}
        self.scheduler_ip_addr = None
        self.file_uses = {}
        self.job_infos = {}
        self.lock = asyncoro.Lock()
        self.terminate = False
        self.signature = os.urandom(20).encode('hex')
        self.auth_code = hashlib.sha1(self.signature + secret).hexdigest()
        self.zombie_interval = 60 * zombie_interval

        logger.debug('auth_code for %s: %s', ip_addr, self.auth_code)

        # UDP socket for ping / pulse / server-port messages; bound on all
        # interfaces and wrapped for asynchronous use once it is bound.
        self.udp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        self.udp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        self.udp_sock.bind(('', node_port))
        logger.info('serving %s cpus at %s:%s', self.cpus, self.ip_addr, node_port)
        logger.debug('tcp server at %s:%s', self.address[0], self.address[1])
        self.udp_sock = AsynCoroSocket(self.udp_sock, blocking=False)

        scheduler_ip_addr = _node_ipaddr(scheduler_node)

        # Queue on which job processes post replies, drained by a thread.
        self.reply_Q = multiprocessing.Queue()
        self.reply_Q_thread = threading.Thread(target=self.__reply_Q)
        self.reply_Q_thread.start()

        self.timer_coro = Coro(self.timer_task)
        # self.tcp_coro = Coro(self.tcp_server)
        self.udp_coro = Coro(self.udp_server, scheduler_ip_addr)

    def send_pong_msg(self, coro=None):
        """Broadcast a 'PONG:' message describing this node to the scheduler
        port.  Must be used with 'yield' from a coroutine.

        The socket is closed even if building or sending the message fails.
        """
        ping_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            ping_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
            ping_sock = AsynCoroSocket(ping_sock, blocking=False)
            pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1],
                        'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version}
            pong_msg = 'PONG:' + serialize(pong_msg)
            yield ping_sock.sendto(pong_msg, ('<broadcast>', self.scheduler_port))
        finally:
            # Release the descriptor on every exit path, not only on success.
            ping_sock.close()

    def udp_server(self, scheduler_ip_addr, coro=None):
        """Daemon coroutine that serves the node's UDP socket.

        On start it broadcasts a pong if all cpus are available and, if
        'scheduler_ip_addr' is given, also sends the pong directly to that
        scheduler.  It then loops forever handling:

        'PING:'       -- validated; answered with a pong unless busy.
        'PULSE:'      -- from the current scheduler; refreshes 'last_pulse'
                         of every computation.
        'SERVERPORT:' -- answered with this node's TCP address and signature.

        Anything else is logged and ignored.
        """
        assert coro is not None
        coro.set_daemon()
        if self.avail_cpus == self.cpus:
            yield self.send_pong_msg(coro=coro)
        pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1],
                    'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version}
        pong_msg = 'PONG:' + serialize(pong_msg)

        if scheduler_ip_addr:
            sock = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
            try:
                yield sock.sendto(pong_msg, (scheduler_ip_addr, self.scheduler_port))
            except Exception:
                # 'Exception' rather than a bare except so GeneratorExit
                # (coroutine being closed) is not swallowed at a yield.
                logger.warning("Couldn't send ping message to %s:%s",
                               scheduler_ip_addr, self.scheduler_port)
            finally:
                sock.close()

        while True:
            msg, addr = yield self.udp_sock.recvfrom(1024)
            # TODO: process each message as separate Coro, so
            # exceptions are contained?
            if msg.startswith('PING:'):
                if self.cpus != self.avail_cpus:
                    logger.debug('Busy (%s/%s); ignoring ping message from %s',
                                 self.cpus, self.avail_cpus, addr[0])
                    continue
                try:
                    info = unserialize(msg[len('PING:'):])
                    socket.inet_aton(info['scheduler_ip_addr'])
                    assert isinstance(info['scheduler_port'], int)
                    assert info['version'] == _dispy_version
                    # Reply to the address advertised in the message, not
                    # the address the datagram came from.
                    addr = (info['scheduler_ip_addr'], info['scheduler_port'])
                except Exception:
                    logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
                    continue
                yield self.udp_sock.sendto(pong_msg, addr)
            elif msg.startswith('PULSE:'):
                try:
                    info = unserialize(msg[len('PULSE:'):])
                    assert info['ip_addr'] == self.scheduler_ip_addr
                    yield self.lock.acquire()
                    for compute in self.computations.itervalues():
                        compute.last_pulse = time.time()
                    yield self.lock.release()
                except Exception:
                    logger.warning('Ignoring PULSE from %s', addr[0])
            elif msg.startswith('SERVERPORT:'):
                sock = None
                try:
                    req = unserialize(msg[len('SERVERPORT:'):])
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    reply = {'ip_addr':self.address[0], 'port':self.address[1],
                             'sign':self.signature, 'version':_dispy_version}
                    sock = AsynCoroSocket(sock, blocking=False)
                    sock.settimeout(1)
                    yield sock.sendto(serialize(reply), (req['ip_addr'], req['port']))
                except Exception:
                    logger.debug(traceback.format_exc())
                finally:
                    # The request is untrusted; a malformed one (e.g. missing
                    # 'ip_addr') must not leave the reply socket open.
                    if sock is not None:
                        sock.close()
            else:
                logger.warning('Ignoring ping message from %s', addr[0])

    def tcp_serve_task(self, conn, addr, coro=None):
        conn = AsynCoroSocket(conn, blocking=False,
                              keyfile=self.keyfile, certfile=self.certfile)
        def job_request_task(msg):
            """Handle a new-job request received on 'conn'.

            'msg' is the serialized job.  The request is answered with a
            'NAK ...' message if all cpus are busy, if the job's computation
            is unknown or belongs to another scheduler, or if the
            computation type is not recognized.  Otherwise the files sent
            with the job are saved under the computation's dest_path, 'ACK'
            is sent, the job is recorded in 'job_infos' and it is started:
            as a multiprocessing.Process for function computations, or in a
            thread running __job_program for program computations.
            """
            assert coro is not None
            try:
                _job = unserialize(msg)
            except:
                logger.debug('Ignoring job request from %s', addr[0])
                logger.debug(traceback.format_exc())
                raise StopIteration
            yield self.lock.acquire()
            compute = self.computations.get(_job.compute_id, None)
            if compute is not None:
                # Only the scheduler currently using this node may submit jobs.
                if compute.scheduler_ip_addr != self.scheduler_ip_addr:
                    compute = None
            yield self.lock.release()
            if self.avail_cpus == 0:
                logger.warning('All cpus busy')
                try:
                    yield conn.send_msg('NAK (all cpus busy)')
                except:
                    pass
                raise StopIteration
            elif compute is None:
                logger.warning('Invalid computation %s', _job.compute_id)
                try:
                    yield conn.send_msg('NAK (invalid computation %s)' % _job.compute_id)
                except:
                    pass
                raise StopIteration

            reply_addr = (compute.scheduler_ip_addr, compute.job_result_port)
            logger.debug('New job id %s from %s', _job.uid, addr[0])
            files = []
            for f in _job.files:
                # basename() keeps the file inside the computation's dest_path.
                tgt = os.path.join(compute.dest_path, os.path.basename(f['name']))
                try:
                    # 'with' closes the file even when the write fails.
                    with open(tgt, 'wb') as fd:
                        fd.write(f['data'])
                except:
                    logger.warning('Could not save file "%s"', tgt)
                    continue
                try:
                    os.utime(tgt, (f['stat'].st_atime, f['stat'].st_mtime))
                    os.chmod(tgt, stat.S_IMODE(f['stat'].st_mode))
                except:
                    logger.debug('Could not set modes for "%s"', tgt)
                files.append(tgt)
            # From here on the job refers to the local copies.
            _job.files = files

            if compute.type == _Compute.func_type:
                reply = _JobReply(_job, self.ext_ip_addr)
                job_info = _DispyJobInfo(reply, reply_addr, compute)
                args = (job_info, self.certfile, self.keyfile,
                        _job.args, _job.kwargs, self.reply_Q,
                        compute.name, compute.code, compute.dest_path, _job.files)
                try:
                    yield conn.send_msg('ACK')
                except:
                    logger.warning('Failed to send response for new job to %s', str(addr))
                    raise StopIteration
                job_info.job_reply.status = DispyJob.Running
                job_info.proc = multiprocessing.Process(target=_dispy_job_func, args=args)
                yield self.lock.acquire()
                self.avail_cpus -= 1
                compute.pending_jobs += 1
                self.job_infos[_job.uid] = job_info
                self.lock.release()
                job_info.proc.start()
                raise StopIteration
            elif compute.type == _Compute.prog_type:
                try:
                    yield conn.send_msg('ACK')
                except:
                    logger.warning('Failed to send response for new job to %s', str(addr))
                    raise StopIteration
                reply = _JobReply(_job, self.ext_ip_addr)
                job_info = _DispyJobInfo(reply, reply_addr, compute)
                job_info.job_reply.status = DispyJob.Running
                yield self.lock.acquire()
                self.job_infos[_job.uid] = job_info
                self.avail_cpus -= 1
                compute.pending_jobs += 1
                yield self.lock.release()
                prog_thread = threading.Thread(target=self.__job_program, args=(_job, job_info))
                prog_thread.start()
                raise StopIteration
            else:
                try:
                    yield conn.send_msg('NAK (invalid computation type "%s")' % compute.type)
                except:
                    logger.warning('Failed to send response for new job to %s', str(addr))

        def add_computation_task(msg):
            """Handle a request to register a computation with this node.

            'msg' is the serialized computation.  The node replies on 'conn'
            with 'ACK' (followed by ':XFER_FILES:' and the serialized list of
            files it still needs, if any), or with 'Busy' / 'NACK ...' /
            'Invalid computation request' when the request is rejected.

            'self.lock' is acquired after the request is unserialized and is
            released on every exit path before the reply is sent.
            """
            assert coro is not None
            try:
                compute = unserialize(msg)
            except:
                logger.debug('Ignoring computation request from %s', addr[0])
                try:
                    yield conn.send_msg('Invalid computation request')
                except:
                    logger.warning('Failed to send reply to %s', str(addr))
                raise StopIteration
            yield self.lock.acquire()
            # Accept only if the node is unused or the request comes from the
            # scheduler (address and port) that is already using it.
            if not ((self.scheduler_ip_addr is None) or
                    (self.scheduler_ip_addr == compute.scheduler_ip_addr and \
                     self.scheduler_port == compute.scheduler_port)):
                logger.debug('Ignoring computation request from %s: %s, %s, %s',
                             compute.scheduler_ip_addr, self.scheduler_ip_addr,
                             self.avail_cpus, self.cpus)
                self.lock.release()
                try:
                    yield conn.send_msg('Busy')
                except:
                    pass
                raise StopIteration

            resp = 'ACK'
            if compute.dest_path and isinstance(compute.dest_path, str):
                # NOTE(review): only leading/trailing separators are removed;
                # components such as '..' in a client-supplied path are not
                # rejected here -- confirm this is validated elsewhere.
                compute.dest_path = compute.dest_path.strip(os.sep)
            else:
                # No path requested: try up to 20 random names until one does
                # not exist yet under dest_path_prefix.
                for x in xrange(20):
                    compute.dest_path = os.urandom(8).encode('hex')
                    if compute.dest_path.find(os.sep) >= 0:
                        continue
                    if not os.path.isdir(os.path.join(self.dest_path_prefix, compute.dest_path)):
                        break
                else:
                    logger.warning('Failed to create unique dest_path: %s', compute.dest_path)
                    resp = 'NACK'
            compute.dest_path = os.path.join(self.dest_path_prefix, compute.dest_path)
            try:
                os.makedirs(compute.dest_path)
                os.chmod(compute.dest_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
                logger.debug('dest_path for "%s": %s', compute.name, compute.dest_path)
            except:
                logger.warning('Invalid destination path: "%s"', compute.dest_path)
                if os.path.isdir(compute.dest_path):
                    os.rmdir(compute.dest_path)
                self.lock.release()
                try:
                    yield conn.send_msg('NACK (Invalid dest_path)')
                except:
                    logger.warning('Failed to send reply to %s', str(addr))
                raise StopIteration
            if compute.id in self.computations:
                logger.warning('Computation "%s" (%s) is being replaced',
                               compute.name, compute.id)
            # Node-side bookkeeping attached to the computation object.
            setattr(compute, 'last_pulse', time.time())
            setattr(compute, 'pending_jobs', 0)
            setattr(compute, 'pending_results', 0)
            setattr(compute, 'zombie', False)
            logger.debug('xfer_files given: %s', ','.join(xf.name for xf in compute.xfer_files))
            if compute.type == _Compute.func_type:
                # Compile now so a broken computation is rejected up front;
                # the code object is stored marshalled.
                try:
                    code = compile(compute.code, '<string>', 'exec')
                except:
                    logger.warning('Computation "%s" could not be compiled', compute.name)
                    if os.path.isdir(compute.dest_path):
                        os.rmdir(compute.dest_path)
                    self.lock.release()
                    try:
                        yield conn.send_msg('NACK (Compilation failed)')
                    except:
                        logger.warning('Failed to send reply to %s', str(addr))
                    raise StopIteration
                compute.code = marshal.dumps(code)
            elif compute.type == _Compute.prog_type:
                assert not compute.code
                # The program is referred to by its location in dest_path.
                compute.name = os.path.join(compute.dest_path, os.path.basename(compute.name))

            # Work out which files still have to be transferred: files that
            # _same_file() reports as already present only get their use
            # count incremented.
            xfer_files = []
            for xf in compute.xfer_files:
                tgt = os.path.join(compute.dest_path, os.path.basename(xf.name))
                try:
                    if _same_file(tgt, xf):
                        logger.debug('Ignoring file "%s" / "%s"', xf.name, tgt)
                        if tgt not in self.file_uses:
                            self.file_uses[tgt] = 0
                        self.file_uses[tgt] += 1
                        continue
                except:
                    pass
                if self.max_file_size and xf.stat_buf.st_size > self.max_file_size:
                    resp = 'NACK (file "%s" too big)' % xf.name
                else:
                    xfer_files.append(xf)
            if resp == 'ACK' and ((self.scheduler_ip_addr is not None) and \
                                  (self.scheduler_ip_addr != compute.scheduler_ip_addr)):
                resp = 'NACK (busy)'
            if resp == 'ACK':
                # Commit: from now on this node belongs to the requesting
                # scheduler.
                self.computations[compute.id] = compute
                self.scheduler_ip_addr = compute.scheduler_ip_addr
                self.scheduler_port = compute.scheduler_port
                self.pulse_interval = compute.pulse_interval
                self.lock.release()
                if xfer_files:
                    resp += ':XFER_FILES:' + serialize(xfer_files)
                try:
                    yield conn.send_msg(resp)
                except:
                    # The scheduler never got the ACK: undo the registration.
                    assert self.scheduler_ip_addr == compute.scheduler_ip_addr
                    yield self.lock.acquire()
                    del self.computations[compute.id]
                    self.scheduler_ip_addr = None
                    self.scheduler_port = None
                    self.pulse_interval = None
                    self.lock.release()
                else:
                    self.timer_coro.resume(True)
            else:
                # Rejected: remove the directory created above (only works if
                # it is empty) and report the reason.
                self.lock.release()
                if os.path.isdir(compute.dest_path):
                    os.rmdir(compute.dest_path)
                try:
                    yield conn.send_msg(resp)
                except:
                    pass

        def xfer_file_task(msg):
            """Handle a file transfer request received on 'conn'.

            'msg' is the serialized description of the file ('xf').  If a
            file that _same_file() considers identical already exists in the
            computation's dest_path, only its use count is incremented.
            Otherwise 'xf.stat_buf.st_size' bytes are read from 'conn' and
            written to the target, and 'ACK', 'NAK (read only N bytes)' or
            'NACK' is sent back.

            NOTE(review): no reply is sent when the file already exists and
            is identical, nor when the computation is unknown -- confirm the
            client does not wait for one in those cases.
            """
            assert coro is not None
            try:
                xf = unserialize(msg)
            except:
                logger.debug('Ignoring file trasnfer request from %s', addr[0])
                raise StopIteration
            resp = ''
            if xf.compute_id not in self.computations:
                logger.error('computation "%s" is invalid' % xf.compute_id)
                raise StopIteration
            # basename() keeps the file inside the computation's dest_path.
            tgt = os.path.join(self.computations[xf.compute_id].dest_path,
                               os.path.basename(xf.name))
            if os.path.isfile(tgt):
                if _same_file(tgt, xf):
                    yield self.lock.acquire()
                    if tgt in self.file_uses:
                        self.file_uses[tgt] += 1
                    else:
                        self.file_uses[tgt] = 1
                    yield self.lock.release()
                    resp = 'ACK'
                else:
                    logger.warning('File "%s" already exists with different status as "%s"',
                                   xf.name, tgt)
            if not resp:
                logger.debug('Copying file %s to %s (%s)', xf.name, tgt, xf.stat_buf.st_size)
                try:
                    fd = open(tgt, 'wb')
                    try:
                        n = 0
                        while n < xf.stat_buf.st_size:
                            data = yield conn.recvall(min(xf.stat_buf.st_size-n, 10240000))
                            if not data:
                                break
                            fd.write(data)
                            n += len(data)
                            if self.max_file_size and n > self.max_file_size:
                                logger.warning('File "%s" is too big (%s); it is truncated', tgt, n)
                                break
                    finally:
                        # Close the file even if receiving or writing fails.
                        fd.close()
                    if n < xf.stat_buf.st_size:
                        resp = 'NAK (read only %s bytes)' % n
                    else:
                        resp = 'ACK'
                        logger.debug('Copied file %s, %s', tgt, resp)
                        # Preserve the sender's timestamps and mode.
                        os.utime(tgt, (xf.stat_buf.st_atime, xf.stat_buf.st_mtime))
                        os.chmod(tgt, stat.S_IMODE(xf.stat_buf.st_mode))
                        self.file_uses[tgt] = 1
                except:
                    logger.warning('Copying file "%s" failed with "%s"',
                                   xf.name, traceback.format_exc())
                    resp = 'NACK'
                try:
                    yield conn.send_msg(resp)
                except:
                    logger.debug('Could not send reply for "%s"', xf.name)
            raise StopIteration # xfer_file_task

        def terminate_job_task(msg):
            """Handle a request to terminate a running job.

            'msg' is the serialized job.  The request is ignored unless it
            comes from the scheduler of the job's computation.  The job is
            removed from 'job_infos', its process is terminated and polled
            for up to about 2 seconds (a program started with
            subprocess.Popen is additionally killed half way through), and a
            reply with status DispyJob.Terminated is sent to the scheduler.
            Nothing is sent if the job had already finished or the process
            could not be stopped.
            """
            assert coro is not None
            yield self.lock.acquire()
            try:
                _job = unserialize(msg)
                compute = self.computations[_job.compute_id]
                assert addr[0] == compute.scheduler_ip_addr
                job_info = self.job_infos.pop(_job.uid, None)
            except:
                logger.debug('Ignoring job request from %s', addr[0])
                raise StopIteration
            finally:
                # Released on both the normal and the rejected path.
                self.lock.release()
            if job_info is None:
                logger.debug('Job %s completed; ignoring cancel request from %s',
                             _job.uid, addr[0])
                raise StopIteration
            logger.debug('Terminating job %s', _job.uid)
            job_info.proc.terminate()
            if isinstance(job_info.proc, multiprocessing.Process):
                for x in xrange(20):
                    if job_info.proc.is_alive():
                        yield coro.sleep(0.1)
                    else:
                        logger.debug('Process "%s" for job %s terminated', compute.name, _job.uid)
                        break
                else:
                    logger.warning('Could not kill process %s', compute.name)
                    raise StopIteration
            else:
                assert isinstance(job_info.proc, subprocess.Popen)
                for x in xrange(20):
                    rc = job_info.proc.poll()
                    if rc is not None:
                        # Log only once the program has really exited; while
                        # it is still running poll() returns None.
                        logger.debug('Program "%s" for job %s terminated with %s',
                                     compute.name, _job.uid, rc)
                        break
                    if x == 10:
                        # terminate() was not enough; escalate.
                        logger.debug('Killing job %s', _job.uid)
                        job_info.proc.kill()
                    yield coro.sleep(0.1)
                else:
                    logger.warning('Could not kill process %s', compute.name)
                    raise StopIteration
            reply_addr = (addr[0], compute.job_result_port)
            reply = _JobReply(_job, self.ext_ip_addr)
            job_info = _DispyJobInfo(reply, reply_addr, compute)
            reply.status = DispyJob.Terminated
            yield self._send_job_reply(job_info, resending=False, coro=coro)

        def retrieve_job_task(msg):
            """Handle a 'RETRIEVE_JOB:' request.

            'msg' is a serialized dictionary with 'uid', 'hash' and
            'compute_id' of the job. If the job is still in
            'self.job_infos', its current reply is sent over 'conn'.
            Otherwise files named '_dispy_job_reply_<uid>' in the
            sub-directories of 'self.dest_path_prefix' are searched; a
            stored reply with matching hash is sent and, once the peer
            acknowledges it with 'ACK', the file is removed and the
            computation's count of pending results is updated. If no
            reply is found, an 'Invalid job' message is sent instead.

            Must be used with 'yield'.
            """
            assert coro is not None
            try:
                req = unserialize(msg)
                assert req['uid'] is not None
                assert req['hash'] is not None
                assert req['compute_id'] is not None
            except:
                resp = serialize('Invalid job')
                try:
                    yield conn.send_msg(resp)
                except:
                    pass
                raise StopIteration

            job_info = self.job_infos.get(req['uid'], None)
            if job_info is not None:
                try:
                    yield conn.send_msg(serialize(job_info.job_reply))
                    yield conn.recv_msg()
                    # no need to check ack
                except:
                    logger.debug('Could not send reply for job %s', req['uid'])
                raise StopIteration

            reply_name = '_dispy_job_reply_%s' % req['uid']
            for d in os.listdir(self.dest_path_prefix):
                info_file = os.path.join(self.dest_path_prefix, d, reply_name)
                if not os.path.isfile(info_file):
                    continue
                # NOTE(review): pickle.load can execute arbitrary code; this
                # assumes only this node writes '_dispy_job_reply_*' files
                # under dest_path_prefix - confirm
                try:
                    with open(info_file, 'rb') as fd:
                        job_reply = pickle.load(fd)
                except:
                    job_reply = None
                if not (hasattr(job_reply, 'hash') and job_reply.hash == req['hash']):
                    continue
                try:
                    yield conn.send_msg(serialize(job_reply))
                    ack = yield conn.recv_msg()
                    assert ack == 'ACK'
                except:
                    logger.debug('Could not send reply for job %s', req['uid'])
                    raise StopIteration
                try:
                    os.remove(info_file)
                except:
                    # the file is still there, so the result stays pending
                    logger.debug('Could not remove "%s"', info_file)
                    raise StopIteration
                yield self.lock.acquire()
                try:
                    compute = self.computations.get(req['compute_id'], None)
                    if compute is not None:
                        compute.pending_results -= 1
                        if compute.pending_results == 0:
                            compute.zombie = True
                            self.cleanup_computation(compute)
                except Exception:
                    logger.debug('Could not update computation %s: %s',
                                 req['compute_id'], traceback.format_exc())
                finally:
                    # release the lock even if cleanup fails
                    self.lock.release()
                raise StopIteration

            # no stored reply matched the request
            resp = serialize('Invalid job: %s' % req['uid'])
            try:
                yield conn.send_msg(resp)
            except:
                pass

        # tcp_serve_task starts
        try:
            req = yield conn.recvall(len(self.auth_code))
            assert req == self.auth_code
        except:
            logger.warning('Ignoring request; invalid client authentication?')
            conn.close()
            raise StopIteration
        msg = yield conn.recv_msg()
        if not msg:
            conn.close()
            raise StopIteration
        if msg.startswith('JOB:'):
            msg = msg[len('JOB:'):]
            yield job_request_task(msg)
            conn.close()
        elif msg.startswith('COMPUTE:'):
            msg = msg[len('COMPUTE:'):]
            yield add_computation_task(msg)
            conn.close()
        elif msg.startswith('FILEXFER:'):
            msg = msg[len('FILEXFER:'):]
            yield xfer_file_task(msg)
            conn.close()
        elif msg.startswith('DEL_COMPUTE:'):
            msg = msg[len('DEL_COMPUTE:'):]
            try:
                info = unserialize(msg)
                compute_id = info['ID']
                yield self.lock.acquire()
                compute = self.computations.get(compute_id, None)
                if compute is None:
                    logger.warning('Computation "%s" is not valid', compute_id)
                else:
                    compute.zombie = True
                    self.cleanup_computation(compute)
                self.lock.release()
            except:
                logger.debug('Deleting computation failed with %s', traceback.format_exc())
                # raise
            conn.close()
        elif msg.startswith('TERMINATE_JOB:'):
            msg = msg[len('TERMINATE_JOB:'):]
            yield terminate_job_task(msg)
            conn.close()
        elif msg.startswith('RETRIEVE_JOB:'):
            msg = msg[len('RETRIEVE_JOB:'):]
            yield retrieve_job_task(msg)
            conn.close()
        else:
            logger.warning('Invalid request "%s" from %s',
                           msg[:min(10, len(msg))], addr[0])
            resp = 'NAK (invalid command: %s)' % (msg[:min(10, len(msg))])
            try:
                yield conn.send_msg(resp)
            except:
                logger.warning('Failed to send reply to %s', str(addr))
            conn.close()

    def timer_task(self, coro=None):
        """Daemon coroutine for periodic housekeeping.

        Each time it wakes up (after a timeout derived from
        'pulse_interval' and 'zombie_interval', or when resumed), it

        * sends a 'PULSE:' message to the scheduler if 'pulse_interval'
          has elapsed and some CPUs are in use,
        * if 'zombie_interval' has elapsed: marks computations that have
          not been heard from as zombies, cleans up zombies without
          pending jobs, queues resending of stored job replies (at most
          128 per computation per round) and sends 'TERMINATED:' to the
          schedulers of the cleaned up computations.

        The value sent when the coroutine is resumed tells whether the
        timeout must be recomputed.
        """
        coro.set_daemon()
        reset = True
        last_pulse_time = last_zombie_time = time.time()
        while True:
            if reset:
                if self.pulse_interval and self.zombie_interval:
                    timeout = min(self.pulse_interval, self.zombie_interval)
                    self.zombie_interval = max(5 * self.pulse_interval, self.zombie_interval)
                else:
                    timeout = max(self.pulse_interval, self.zombie_interval)

            reset = yield coro.suspend(timeout)

            now = time.time()
            if self.pulse_interval and (now - last_pulse_time) >= self.pulse_interval:
                n = self.cpus - self.avail_cpus
                assert n >= 0
                if n > 0 and self.scheduler_ip_addr:
                    last_pulse_time = now
                    msg = 'PULSE:' + serialize({'ip_addr':self.ext_ip_addr,
                                                'port':self.udp_sock.getsockname()[1], 'cpus':n})
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    sock = AsynCoroSocket(sock, blocking=False)
                    sock.settimeout(1)
                    try:
                        yield sock.sendto(msg, (self.scheduler_ip_addr, self.scheduler_port))
                    finally:
                        sock.close()
            if self.zombie_interval and (now - last_zombie_time) >= self.zombie_interval:
                last_zombie_time = now
                yield self.lock.acquire()
                try:
                    for compute in self.computations.itervalues():
                        if (now - compute.last_pulse) > self.zombie_interval:
                            compute.zombie = True
                    # lists are built first, as cleanup_computation
                    # deletes entries from self.computations
                    zombies = [compute for compute in self.computations.itervalues()
                               if compute.zombie and compute.pending_jobs == 0]
                    for compute in zombies:
                        logger.debug('Deleting zombie computation "%s"', compute.name)
                        self.cleanup_computation(compute)
                    # live computations with job replies stored on disk
                    phoenix = [compute for compute in self.computations.itervalues()
                               if not compute.zombie and compute.pending_results]
                    for compute in phoenix:
                        try:
                            files = [f for f in os.listdir(compute.dest_path)
                                     if f.startswith('_dispy_job_reply_')]
                        except OSError:
                            logger.debug('Could not list "%s"', compute.dest_path)
                            continue
                        # limit number queued so as not to take up too much time
                        for f in files[:128]:
                            result_file = os.path.join(compute.dest_path, f)
                            try:
                                with open(result_file, 'rb') as fd:
                                    job_result = pickle.load(fd)
                            except:
                                logger.debug('Could not load "%s"', result_file)
                                logger.debug(traceback.format_exc())
                                continue
                            try:
                                os.remove(result_file)
                            except:
                                logger.debug('Could not remove "%s"', result_file)
                            compute.pending_results -= 1
                            job_info = _DispyJobInfo(job_result, (compute.scheduler_ip_addr,
                                                                  compute.job_result_port), compute)
                            Coro(self._send_job_reply, job_info, resending=True)
                finally:
                    # never leave the lock held if housekeeping fails
                    self.lock.release()
                for compute in zombies:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    sock = AsynCoroSocket(sock, blocking=False)
                    sock.settimeout(1)
                    logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr)
                    data = serialize({'ip_addr':self.address[0], 'port':self.address[1],
                                      'sign':self.signature})
                    try:
                        yield sock.sendto('TERMINATED:%s' % data, (compute.scheduler_ip_addr,
                                                                   compute.scheduler_port))
                    finally:
                        sock.close()
                if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus:
                    # node is idle and not bound to any scheduler
                    self.pulse_interval = None
                    reset = True
                    yield self.send_pong_msg(coro=coro)

    def __job_program(self, _job, job_info):
        """Run the computation's program for a job and queue its reply.

        The program is 'compute.name' followed by the unserialized job
        arguments. It is executed from the computation's destination
        directory, with that directory prepended to PATH. The started
        process is stored in 'job_info.proc'. The reply gets the
        program's stdout, stderr and exit status (status 'Finished'),
        or the traceback if execution failed (status 'Terminated'). In
        either case the reply is put in 'self.reply_Q'.
        """
        compute = self.computations[_job.compute_id]
        program = [compute.name]
        args = unserialize(_job.args)
        program.extend(args)
        logger.debug('Executing "%s"', str(program))
        reply = job_info.job_reply
        try:
            os.chdir(compute.dest_path)
            env = dict(os.environ)
            # use the platform's separator (';' on Windows, ':' on POSIX)
            # and don't fail the job if PATH is not set at all
            search_path = env.get('PATH')
            if search_path:
                env['PATH'] = compute.dest_path + os.pathsep + search_path
            else:
                env['PATH'] = compute.dest_path
            job_info.proc = subprocess.Popen(program, stdout=subprocess.PIPE,
                                             stderr=subprocess.PIPE, env=env)

            reply.stdout, reply.stderr = job_info.proc.communicate()
            reply.result = job_info.proc.returncode
            reply.status = DispyJob.Finished
        except:
            logger.debug('Executing %s failed with %s', str(program), str(sys.exc_info()))
            reply.exception = traceback.format_exc()
            reply.status = DispyJob.Terminated
        self.reply_Q.put(reply)

    def __reply_Q(self):
        """Consume job replies from 'self.reply_Q' until a None entry is
        received.

        For each reply whose job is still in 'self.job_infos', the job's
        process (if any) is waited for and the reply is sent with
        '_send_job_reply'; replies of unknown jobs are dropped.
        """
        while True:
            job_reply = self.reply_Q.get()
            if job_reply is None:
                # None marks the end of the queue
                return
            job_info = self.job_infos.pop(job_reply.uid, None)
            if job_info is None:
                continue
            proc = job_info.proc
            if proc is not None:
                if isinstance(proc, multiprocessing.Process):
                    proc.join(2)
                else:
                    proc.wait()
            job_info.job_reply = job_reply
            # wait until the reply has been dealt with before taking the next one
            Coro(self._send_job_reply, job_info, resending=False).value()

    def _send_job_reply(self, job_info, resending=False, coro=None):
        """Internal use only.

        Send 'job_info.job_reply' to 'job_info.reply_addr' and wait for
        'ACK'. If that fails, the reply is pickled to a
        '_dispy_job_reply_<uid>' file in the computation's destination
        path and the computation's 'pending_results' is incremented.

        Unless 'resending' is True, the CPU used by the job is released
        and the computation's pending job count is decremented, whether
        or not the reply could be sent.

        Must be used with 'yield'.
        """
        assert coro is not None
        job_reply = job_info.job_reply
        logger.debug('Sending result for job %s (%s) to %s',
                     job_reply.uid, job_reply.status, str(job_info.reply_addr))
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock = AsynCoroSocket(sock, blocking=False, certfile=self.certfile, keyfile=self.keyfile)
        sock.settimeout(2)
        try:
            yield sock.connect(job_info.reply_addr)
            yield sock.send_msg(serialize(job_reply))
            ack = yield sock.recv_msg()
            assert ack == 'ACK'
        except:
            logger.error("Couldn't send results for %s to %s",
                         job_reply.uid, str(job_info.reply_addr))
            # store job result even if computation has not enabled
            # fault recovery; user may be able to access node and
            # retrieve result manually
            f = os.path.join(job_info.compute_dest_path, '_dispy_job_reply_%s' % job_reply.uid)
            logger.debug('storing results for job %s', job_reply.uid)
            try:
                with open(f, 'wb') as fd:
                    pickle.dump(job_reply, fd)
            except:
                logger.debug('Could not save results for job %s', job_reply.uid)
            else:
                yield self.lock.acquire()
                try:
                    compute = self.computations.get(job_info.compute_id, None)
                    if compute is not None:
                        compute.pending_results += 1
                finally:
                    self.lock.release()
        finally:
            sock.close()
            if not resending:
                yield self.lock.acquire()
                try:
                    self.avail_cpus += 1
                    compute = self.computations.get(job_info.compute_id, None)
                    if compute is None:
                        logger.warning('Computation for %s / %s is invalid!',
                                       job_reply.uid, job_info.compute_id)
                    else:
                        # technically last_pulse should be updated only
                        # when successfully sent reply, but no harm if done
                        # otherwise, too
                        compute.last_pulse = time.time()
                        compute.pending_jobs -= 1
                        if compute.pending_jobs == 0 and compute.zombie:
                            self.cleanup_computation(compute)
                finally:
                    # release the lock even if cleanup_computation fails
                    self.lock.release()

    def cleanup_computation(self, compute):
        """Delete a zombie computation that has no pending jobs.

        Must be called with 'self.lock' held. Nothing is done unless
        'compute.zombie' is set and no jobs are pending. The computation
        is removed from 'self.computations'; if it was the last one of
        the current scheduler, the node is detached from that scheduler.
        Unless 'compute.cleanup' is False, transferred files that are no
        longer used by any computation are removed, and the destination
        directory is removed if it is empty and is a sub-directory of
        'self.dest_path_prefix'.
        """
        # called with lock held
        if not compute.zombie:
            return
        if compute.pending_jobs != 0:
            logger.debug('pending jobs for computation "%s"/%s: %s',
                         compute.name, compute.id, compute.pending_jobs)
            if compute.pending_jobs > 0:
                return

        del self.computations[compute.id]
        if compute.scheduler_ip_addr == self.scheduler_ip_addr and \
               all(c.scheduler_ip_addr != self.scheduler_ip_addr \
                   for c in self.computations.itervalues()):
            # no other computation of the current scheduler is left
            assert self.avail_cpus == self.cpus
            self.scheduler_ip_addr = None
            self.pulse_interval = None

        if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus:
            self.timer_coro.resume(True)
            Coro(self.send_pong_msg)
        if compute.cleanup is False:
            return
        for xf in compute.xfer_files:
            tgt = os.path.join(compute.dest_path, os.path.basename(xf.name))
            if tgt not in self.file_uses:
                logger.debug('File "%s" is unknown', tgt)
                continue
            self.file_uses[tgt] -= 1
            if self.file_uses[tgt] != 0:
                # still used by another computation
                continue
            del self.file_uses[tgt]
            # compare with the file's path, not the transfer object itself:
            # if the target is the original file, it must not be deleted
            if tgt == xf.name:
                logger.debug('Not removing file "%s"', xf.name)
                continue
            logger.debug('Removing file "%s"', tgt)
            try:
                os.remove(tgt)
                # remove byte-compiled file, too
                if os.path.splitext(tgt)[1] == '.py' and os.path.isfile(tgt + 'c'):
                    os.remove(tgt + 'c')
            except:
                logger.warning('Could not remove file "%s"', tgt)

        # only an empty, proper sub-directory of dest_path_prefix is removed
        if os.path.isdir(compute.dest_path) and \
               compute.dest_path.startswith(self.dest_path_prefix) and \
               len(compute.dest_path) > len(self.dest_path_prefix) and \
               len(os.listdir(compute.dest_path)) == 0:
            logger.debug('Removing "%s"', compute.dest_path)
            try:
                os.rmdir(compute.dest_path)
            except:
                logger.warning('Could not remove directory "%s"', compute.dest_path)

    def shutdown(self):
        """Shut down the node.

        All tracked jobs and computations are dropped, the reply queue
        is told to finish (with None), processes of running jobs are
        terminated and a 'TERMINATED:' message is sent to the scheduler
        of each computation. Then asyncoro is joined and terminated.
        """
        def _shutdown(self, coro=None):
            assert coro is not None
            yield self.lock.acquire()
            try:
                job_infos = self.job_infos
                self.job_infos = {}
                computations = self.computations.items()
                self.computations = {}
                if self.reply_Q:
                    self.reply_Q.put(None)
            finally:
                self.lock.release()
            for uid, job_info in job_infos.iteritems():
                proc = job_info.proc
                if proc is None:
                    # no process to terminate for this job
                    continue
                # best effort: a failure with one job should not prevent
                # terminating the others and notifying the schedulers
                try:
                    proc.terminate()
                    logger.debug('process for %s is killed', uid)
                    if isinstance(proc, multiprocessing.Process):
                        proc.join(2)
                    else:
                        proc.wait()
                except Exception:
                    logger.debug('Could not terminate process for %s: %s',
                                 uid, traceback.format_exc())
            for cid, compute in computations:
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                sock = AsynCoroSocket(sock, blocking=False)
                sock.settimeout(2)
                logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr)
                data = serialize({'ip_addr':self.address[0], 'port':self.address[1],
                                  'sign':self.signature})
                try:
                    yield sock.sendto('TERMINATED:' + data, (compute.scheduler_ip_addr,
                                                             compute.scheduler_port))
                except Exception:
                    logger.debug('Could not send TERMINATED to %s',
                                 compute.scheduler_ip_addr)
                finally:
                    sock.close()

        Coro(_shutdown, self).value()
        self.asyncoro.join()
        self.asyncoro.terminate()