Beispiel #1
0
 def _shutdown(self, coro=None):
     """Terminate all running jobs and tell each scheduler this node is gone.

     Generator intended to be run as a coroutine (``coro`` must be
     supplied). Under ``self.lock`` it detaches the current job and
     computation tables (replacing them with empty dicts) and, if
     ``self.reply_Q`` is set, puts ``None`` on it. It then terminates
     every job process and sends a UDP ``TERMINATED:`` message to the
     scheduler address recorded in each computation.
     """
     assert coro is not None
     yield self.lock.acquire()
     try:
         job_infos = self.job_infos
         self.job_infos = {}
         # snapshot the computations; only the values are needed below
         computations = list(self.computations.values())
         self.computations = {}
         if self.reply_Q:
             self.reply_Q.put(None)
     finally:
         # never leave the node lock held if the section above fails
         self.lock.release()
     for uid, job_info in job_infos.iteritems():
         job_info.proc.terminate()
         logger.debug('process for %s is killed', uid)
         if isinstance(job_info.proc, multiprocessing.Process):
             # bounded wait so a stuck child cannot block shutdown
             job_info.proc.join(2)
         else:
             job_info.proc.wait()
     for compute in computations:
         sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
         sock = AsynCoroSocket(sock, blocking=False)
         try:
             sock.settimeout(2)
             logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr)
             data = serialize({'ip_addr':self.address[0], 'port':self.address[1],
                               'sign':self.signature})
             yield sock.sendto('TERMINATED:' + data, (compute.scheduler_ip_addr,
                                                      compute.scheduler_port))
         finally:
             # close the socket even when the send fails or times out
             sock.close()
Beispiel #2
0
def dispy_provisional_result(result):
    """Sends provisional result of computation back to the client.

    In some cases, such as optimizations, computations may send
    current (best) result to the client and continue computation (for
    next iteration) so that the client may decide to terminate
    computations based on the results or alter computations if
    necessary. The computations can use this function in such cases
    with the current result of computation as argument.

    Delivery is best-effort: any ordinary error while connecting,
    sending or waiting for the acknowledgement is logged as a warning
    and not raised. The reply from the peer is read but not inspected.
    """

    __dispy_job_reply = __dispy_job_info.job_reply
    logger.debug('Sending provisional result for job %s to %s',
                 __dispy_job_reply.uid, __dispy_job_info.reply_addr)
    __dispy_job_reply.status = DispyJob.ProvisionalResult
    __dispy_job_reply.result = result
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock = AsynCoroSocket(sock, blocking=True, keyfile=__dispy_job_keyfile,
                          certfile=__dispy_job_certfile)
    sock.settimeout(2)
    try:
        sock.connect(__dispy_job_info.reply_addr)
        sock.send_msg(serialize(__dispy_job_reply))
        # wait for the peer's reply; its content is not checked
        sock.recv_msg()
    except Exception:
        # do not swallow KeyboardInterrupt / SystemExit
        logger.warning("Couldn't send provisional results %s:\n%s",
                       str(result), traceback.format_exc())
    finally:
        sock.close()
Beispiel #3
0
 def _send_job_reply(self, job_info, resending=False, coro=None):
     """Internal use only.

     Generator (run as a coroutine) that sends ``job_info.job_reply``
     to ``job_info.reply_addr`` over TCP and expects an ``'ACK'`` reply.

     If delivery fails for any reason, the reply is pickled to a
     ``_dispy_job_reply_<uid>`` file under ``job_info.compute_dest_path``
     and, when that succeeds, the computation's ``pending_results``
     counter is incremented.

     Unless ``resending`` is true, the CPU is returned to the pool and
     the computation's ``pending_jobs`` is decremented afterwards, and a
     zombie computation with no pending jobs is cleaned up.
     """
     assert coro is not None
     job_reply = job_info.job_reply
     logger.debug('Sending result for job %s (%s) to %s',
                  job_reply.uid, job_reply.status, str(job_info.reply_addr))
     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     sock = AsynCoroSocket(sock, blocking=False, certfile=self.certfile, keyfile=self.keyfile)
     sock.settimeout(2)
     try:
         yield sock.connect(job_info.reply_addr)
         yield sock.send_msg(serialize(job_reply))
         ack = yield sock.recv_msg()
         # explicit check instead of assert, which is stripped under -O
         if ack != 'ACK':
             raise ValueError('invalid acknowledgement: %r' % (ack,))
     except:
         # deliberately broad: whatever interrupted delivery, try to
         # keep the result on disk
         logger.error("Couldn't send results for %s to %s",
                      job_reply.uid, str(job_info.reply_addr))
         # store job result even if computation has not enabled
         # fault recovery; user may be able to access node and
         # retrieve result manually
         f = os.path.join(job_info.compute_dest_path, '_dispy_job_reply_%s' % job_reply.uid)
         logger.debug('storing results for job %s', job_reply.uid)
         try:
             # 'with' closes the file even if pickling fails
             with open(f, 'wb') as fd:
                 pickle.dump(job_reply, fd)
         except Exception:
             logger.debug('Could not save results for job %s', job_reply.uid)
         else:
             yield self.lock.acquire()
             try:
                 compute = self.computations.get(job_info.compute_id, None)
                 if compute is not None:
                     compute.pending_results += 1
             finally:
                 self.lock.release()
     finally:
         sock.close()
         if not resending:
             yield self.lock.acquire()
             try:
                 self.avail_cpus += 1
                 compute = self.computations.get(job_info.compute_id, None)
                 if compute is None:
                     logger.warning('Computation for %s / %s is invalid!',
                                    job_reply.uid, job_info.compute_id)
                 else:
                     # technically last_pulse should be updated only
                     # when successfully sent reply, but no harm if done
                     # otherwise, too
                     compute.last_pulse = time.time()
                     compute.pending_jobs -= 1
                     if compute.pending_jobs == 0 and compute.zombie:
                         self.cleanup_computation(compute)
             finally:
                 # release even if cleanup_computation raises
                 self.lock.release()
Beispiel #4
0
 def _send_job_reply(self, job_info, resending=False, coro=None):
     """Internal use only.

     Generator (run as a coroutine) that sends ``job_info.job_reply``
     to ``job_info.reply_addr`` over TCP and expects an ``'ACK'`` reply.
     On success the computation's ``last_pulse`` is refreshed.

     If delivery fails for any reason, the reply is pickled to a
     ``_dispy_job_reply_<uid>`` file under ``job_info.compute_dest_path``
     and, when that succeeds, the computation's ``pending_results``
     counter is incremented.

     Unless ``resending`` is true, the CPU is returned to the pool
     before sending and the computation's ``pending_jobs`` is decremented
     afterwards; a zombie computation with no pending jobs is cleaned up.
     """
     assert coro is not None
     job_reply = job_info.job_reply
     logger.debug('Sending result for job %s (%s) to %s',
                  job_reply.uid, job_reply.status, str(job_info.reply_addr))
     if not resending:
         self.avail_cpus += 1
         assert self.avail_cpus <= self.num_cpus
     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
     sock = AsynCoroSocket(sock, blocking=False, certfile=self.certfile, keyfile=self.keyfile)
     sock.settimeout(5)
     try:
         yield sock.connect(job_info.reply_addr)
         yield sock.send_msg(serialize(job_reply))
         ack = yield sock.recv_msg()
         # explicit check instead of assert, which is stripped under -O
         if ack != 'ACK':
             raise ValueError('invalid acknowledgement: %r' % (ack,))
         compute = self.computations.get(job_info.compute_id, None)
         if compute is not None:
             compute.last_pulse = time.time()
     except:
         # deliberately broad: whatever interrupted delivery, try to
         # keep the result on disk
         logger.error("Couldn't send results for %s to %s : %s",
                      job_reply.uid, str(job_info.reply_addr), traceback.format_exc())
         # store job result even if computation has not enabled
         # fault recovery; user may be able to access node and
         # retrieve result manually
         f = os.path.join(job_info.compute_dest_path, '_dispy_job_reply_%s' % job_reply.uid)
         logger.debug('storing results for job %s', job_reply.uid)
         try:
             # 'with' closes the file even if pickling fails
             with open(f, 'wb') as fd:
                 pickle.dump(job_reply, fd)
         except Exception:
             logger.debug('Could not save results for job %s', job_reply.uid)
         else:
             compute = self.computations.get(job_info.compute_id, None)
             if compute is not None:
                 compute.pending_results += 1
     finally:
         sock.close()
         if not resending:
             compute = self.computations.get(job_info.compute_id, None)
             if compute is not None:
                 compute.pending_jobs -= 1
                 if compute.pending_jobs == 0 and compute.zombie:
                     self.cleanup_computation(compute)
Beispiel #5
0
    def timer_task(self, coro=None):
        """Periodic housekeeping coroutine: pulses, zombies and resends.

        Runs forever as a daemon coroutine. Each time it wakes up (after
        ``coro.suspend(timeout)``) it:

        * sends a UDP ``PULSE:`` message to the scheduler when
          ``pulse_interval`` has elapsed and some CPUs are busy;
        * when ``zombie_interval`` has elapsed, marks computations whose
          ``last_pulse`` is too old as zombies, cleans up zombies with no
          pending jobs, re-queues saved ``_dispy_job_reply_*`` results of
          live computations (at most 128 files per computation per pass),
          and sends ``TERMINATED:`` to the schedulers of removed zombies.

        A true value sent back into ``suspend`` makes the timeout get
        recomputed from the current intervals.
        """
        coro.set_daemon()
        reset = True
        last_pulse_time = last_zombie_time = time.time()
        while True:
            if reset:
                if self.pulse_interval and self.zombie_interval:
                    timeout = min(self.pulse_interval, self.zombie_interval)
                    self.zombie_interval = max(5 * self.pulse_interval, self.zombie_interval)
                else:
                    # at most one interval is set; use whichever it is
                    timeout = max(self.pulse_interval, self.zombie_interval)

            reset = yield coro.suspend(timeout)

            now = time.time()
            if self.pulse_interval and (now - last_pulse_time) >= self.pulse_interval:
                n = self.cpus - self.avail_cpus
                assert n >= 0
                if n > 0 and self.scheduler_ip_addr:
                    last_pulse_time = now
                    msg = 'PULSE:' + serialize({'ip_addr':self.ext_ip_addr,
                                                'port':self.udp_sock.getsockname()[1], 'cpus':n})
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    sock = AsynCoroSocket(sock, blocking=False)
                    try:
                        sock.settimeout(1)
                        yield sock.sendto(msg, (self.scheduler_ip_addr, self.scheduler_port))
                    finally:
                        # close the socket even if the send fails
                        sock.close()
            if self.zombie_interval and (now - last_zombie_time) >= self.zombie_interval:
                last_zombie_time = now
                yield self.lock.acquire()
                try:
                    for compute in self.computations.itervalues():
                        if (now - compute.last_pulse) > self.zombie_interval:
                            compute.zombie = True
                    zombies = [compute for compute in self.computations.itervalues()
                               if compute.zombie and compute.pending_jobs == 0]
                    for compute in zombies:
                        logger.debug('Deleting zombie computation "%s"', compute.name)
                        self.cleanup_computation(compute)
                    phoenix = [compute for compute in self.computations.itervalues()
                               if not compute.zombie and compute.pending_results]
                    for compute in phoenix:
                        files = [f for f in os.listdir(compute.dest_path)
                                 if f.startswith('_dispy_job_reply_')]
                        # limit number queued so as not to take up too much time
                        files = files[:128]
                        for f in files:
                            result_file = os.path.join(compute.dest_path, f)
                            try:
                                # NOTE(review): unpickling trusts the contents
                                # of dest_path; confirm only this node writes
                                # these files
                                with open(result_file, 'rb') as fd:
                                    job_result = pickle.load(fd)
                            except Exception:
                                logger.debug('Could not load "%s"', result_file)
                                logger.debug(traceback.format_exc())
                                continue
                            try:
                                os.remove(result_file)
                            except Exception:
                                logger.debug('Could not remove "%s"', result_file)
                            compute.pending_results -= 1
                            job_info = _DispyJobInfo(job_result, (compute.scheduler_ip_addr,
                                                                  compute.job_result_port), compute)
                            Coro(self._send_job_reply, job_info, resending=True)
                finally:
                    # never leave the lock held if e.g. listing dest_path
                    # or cleaning up a computation fails
                    self.lock.release()
                for compute in zombies:
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    sock = AsynCoroSocket(sock, blocking=False)
                    try:
                        sock.settimeout(1)
                        logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr)
                        data = serialize({'ip_addr':self.address[0], 'port':self.address[1],
                                          'sign':self.signature})
                        yield sock.sendto('TERMINATED:%s' % data, (compute.scheduler_ip_addr,
                                                                   compute.scheduler_port))
                    finally:
                        sock.close()
                if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus:
                    # no scheduler and nothing running: stop pulsing and
                    # recompute the timeout on the next iteration
                    self.pulse_interval = None
                    reset = True
                    yield self.send_pong_msg(coro=coro)
Beispiel #6
0
    def udp_server(self, scheduler_ip_addr, coro=None):
        """Daemon coroutine that answers UDP control messages for this node.

        On start it announces the node (via ``send_pong_msg`` when all
        CPUs are free, and with a ``PONG:`` datagram to
        ``scheduler_ip_addr`` if one is given). It then loops forever
        reading datagrams from ``self.udp_sock`` and handling:

        * ``PING:`` -- replies with ``PONG:`` to the scheduler address in
          the message, unless the node is busy or the message is invalid;
        * ``PULSE:`` -- refreshes ``last_pulse`` of every computation when
          the sender matches ``self.scheduler_ip_addr``;
        * ``SERVERPORT:`` -- replies with this node's address, signature
          and version to the address given in the request.

        Anything else is logged and ignored.
        """
        assert coro is not None
        coro.set_daemon()
        if self.avail_cpus == self.cpus:
            yield self.send_pong_msg(coro=coro)
        pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1],
                    'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version}
        pong_msg = 'PONG:' + serialize(pong_msg)

        if scheduler_ip_addr:
            sock = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
            try:
                yield sock.sendto(pong_msg, (scheduler_ip_addr, self.scheduler_port))
            except Exception:
                logger.warning("Couldn't send ping message to %s:%s",
                               scheduler_ip_addr, self.scheduler_port)
            finally:
                sock.close()

        while True:
            msg, addr = yield self.udp_sock.recvfrom(1024)
            # TODO: process each message as separate Coro, so
            # exceptions are contained?
            # NOTE(review): unserialize() is applied to data received from
            # the network; if it is pickle-based, confirm peers are trusted
            if msg.startswith('PING:'):
                if self.cpus != self.avail_cpus:
                    logger.debug('Busy (%s/%s); ignoring ping message from %s',
                                 self.cpus, self.avail_cpus, addr[0])
                    continue
                try:
                    info = unserialize(msg[len('PING:'):])
                    # raises if the address is not a valid IPv4 string
                    socket.inet_aton(info['scheduler_ip_addr'])
                    # explicit checks: asserts would vanish under -O and
                    # let malformed or mismatched pings through
                    if not isinstance(info['scheduler_port'], int):
                        raise ValueError('invalid scheduler port')
                    if info['version'] != _dispy_version:
                        raise ValueError('version mismatch')
                    addr = (info['scheduler_ip_addr'], info['scheduler_port'])
                except Exception:
                    logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
                    continue
                yield self.udp_sock.sendto(pong_msg, addr)
            elif msg.startswith('PULSE:'):
                try:
                    info = unserialize(msg[len('PULSE:'):])
                    if info['ip_addr'] != self.scheduler_ip_addr:
                        raise ValueError('PULSE from unexpected scheduler')
                    yield self.lock.acquire()
                    try:
                        for compute in self.computations.itervalues():
                            compute.last_pulse = time.time()
                    finally:
                        # release even if updating a computation fails
                        yield self.lock.release()
                except Exception:
                    logger.warning('Ignoring PULSE from %s', addr[0])
            elif msg.startswith('SERVERPORT:'):
                try:
                    req = unserialize(msg[len('SERVERPORT:'):])
                    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    reply = {'ip_addr':self.address[0], 'port':self.address[1],
                             'sign':self.signature, 'version':_dispy_version}
                    sock = AsynCoroSocket(sock, blocking=False)
                    try:
                        sock.settimeout(1)
                        yield sock.sendto(serialize(reply), (req['ip_addr'], req['port']))
                    finally:
                        # close the socket even if the request is malformed
                        # or the send fails
                        sock.close()
                except Exception:
                    logger.debug(traceback.format_exc())
            else:
                logger.warning('Ignoring ping message from %s', addr[0])