def _dispy_job_func(__dispy_job_info, __dispy_job_certfile, __dispy_job_keyfile,
                    __dispy_job_args, __dispy_job_kwargs, __dispy_reply_Q,
                    __dispy_job_name, __dispy_job_code, __dispy_path,
                    __dispy_job_files=None):
    """Internal use only.

    Runs in a child process: executes the marshalled computation code,
    calls the named function with unserialized args/kwargs and puts the
    populated job reply (result/status/stdout/stderr) on the reply queue.
    """
    # fix: avoid the shared mutable default argument ([]) pitfall
    if __dispy_job_files is None:
        __dispy_job_files = []
    os.chdir(__dispy_path)
    # capture anything the job prints so it can be shipped back in the reply
    sys.stdout = io.StringIO()
    sys.stderr = io.StringIO()
    __dispy_job_reply = __dispy_job_info.job_reply
    sys.path = [__dispy_path] + sys.path
    try:
        exec(marshal.loads(__dispy_job_code))
        globals().update(locals())
        __dispy_job_args = unserialize(__dispy_job_args)
        __dispy_job_kwargs = unserialize(__dispy_job_kwargs)
        __func = globals()[__dispy_job_name]
        __dispy_job_reply.result = __func(*__dispy_job_args, **__dispy_job_kwargs)
        __dispy_job_reply.status = DispyJob.Finished
    except:
        # any failure in user code is reported back in the reply, never raised
        __dispy_job_reply.exception = traceback.format_exc()
        __dispy_job_reply.status = DispyJob.Terminated
    for f in __dispy_job_files:
        if os.path.isfile(f):
            try:
                os.remove(f)
            except:
                logger.debug('Could not remove "%s"', f)
    __dispy_job_reply.stdout = sys.stdout.getvalue()
    __dispy_job_reply.stderr = sys.stderr.getvalue()
    # ignore SIGTERM while delivering the reply so a concurrent cancel
    # cannot kill this process before the result is queued
    signal.signal(signal.SIGTERM, signal.SIG_IGN)
    __dispy_reply_Q.put(__dispy_job_reply)
def sched_udp_proc(self, coro=None):
    """Daemon coroutine: listen for scheduler 'PING:' datagrams on the
    scheduler UDP port and relay our known scheduler address back to the
    sender (unless the ping is itself a relay)."""
    coro.set_daemon()
    sched_sock = asyncoro.AsyncSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    sched_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sched_sock.bind(('', self.scheduler_port))
    while 1:
        msg, addr = yield sched_sock.recvfrom(1024)
        if (not msg.startswith('PING:'.encode()) or
            not self.scheduler_ip_addrs or not self.scheduler_port):
            logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
            continue
        try:
            info = asyncoro.unserialize(msg[len('PING:'.encode()):])
            logger.debug('sched_sock: %s', info)
            assert info['version'] == __version__
            # assert isinstance(info['cpus'], int)
        except:
            logger.debug(traceback.format_exc())
            # fix: 'info' is unusable (possibly unbound) after a parse
            # failure; bail out instead of hitting NameError below
            continue
        if info.get('relay', None):
            logger.debug('Ignoring ping back from %s: %s', addr[0], info)
            continue
        msg = {'ip_addrs': self.scheduler_ip_addrs, 'port': self.scheduler_port,
               'version': __version__}
        msg['relay'] = 'y'
        relay_sock = asyncoro.AsyncSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
        yield relay_sock.sendto('PING:'.encode() + asyncoro.serialize(msg),
                                (info['ip_addr'], info['port']))
        relay_sock.close()
def sched_udp_proc(self, coro=None):
    """Daemon coroutine: listen for scheduler 'PING:' datagrams on the
    scheduler UDP port and relay our known scheduler address back to the
    sender (unless the ping is itself a relay)."""
    coro.set_daemon()
    sched_sock = asyncoro.AsyncSocket(
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    sched_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sched_sock.bind(('', self.scheduler_port))
    while 1:
        msg, addr = yield sched_sock.recvfrom(1024)
        if (not msg.startswith('PING:'.encode()) or
            not self.scheduler_ip_addrs or not self.scheduler_port):
            logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
            continue
        try:
            info = asyncoro.unserialize(msg[len('PING:'.encode()):])
            # fix: lazy %-args instead of eager string formatting
            logger.debug('sched_sock: %s', info)
            assert info['version'] == __version__
            # assert isinstance(info['cpus'], int)
        except:
            logger.debug(traceback.format_exc())
            # fix: 'info' is unusable (possibly unbound) after a parse
            # failure; bail out instead of hitting NameError below
            continue
        if info.get('relay', None):
            logger.debug('Ignoring ping back from %s: %s', addr[0], info)
            continue
        msg = {
            'ip_addrs': self.scheduler_ip_addrs,
            'port': self.scheduler_port,
            'version': __version__
        }
        msg['relay'] = 'y'
        relay_sock = asyncoro.AsyncSocket(
            socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
        yield relay_sock.sendto('PING:'.encode() + asyncoro.serialize(msg),
                                (info['ip_addr'], info['port']))
        relay_sock.close()
def listen_udp_proc(self, coro=None):
    """Daemon coroutine: announce any known scheduler once at startup,
    then listen for node 'PING:' broadcasts, record the advertised
    scheduler address and re-broadcast the ping to local nodes."""
    coro.set_daemon()
    bc_sock = asyncoro.AsyncSocket(
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    if self.scheduler_ip_addrs and self.scheduler_port:
        relay_request = {
            'ip_addrs': self.scheduler_ip_addrs,
            'port': self.scheduler_port,
            'version': __version__,
            'sign': None
        }
        # fix: AsyncSocket I/O must be yielded (as done in the loop
        # below); without 'yield' the datagram is never actually sent
        yield bc_sock.sendto(
            'PING:'.encode() + asyncoro.serialize(relay_request),
            ('<broadcast>', self.node_port))
    # close unconditionally so the socket is not leaked when no
    # scheduler is known yet
    bc_sock.close()
    listen_sock = asyncoro.AsyncSocket(
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    listen_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listen_sock.bind(('', self.listen_port))
    while 1:
        msg, addr = yield listen_sock.recvfrom(1024)
        if not msg.startswith('PING:'.encode()):
            logger.debug('Ignoring message "%s" from %s',
                         msg[:min(len(msg), 5)], addr[0])
            continue
        logger.debug('Ping message from %s (%s)', addr[0], addr[1])
        try:
            info = asyncoro.unserialize(msg[len('PING:'.encode()):])
            if info['version'] != __version__:
                logger.warning(
                    'Ignoring %s due to version mismatch: %s / %s',
                    info['ip_addrs'], info['version'], __version__)
                continue
            self.scheduler_ip_addrs = info['ip_addrs'] + [addr[0]]
            self.scheduler_port = info['port']
        except:
            logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
            logger.debug(traceback.format_exc())
            continue
        if info.get('relay', None):
            logger.debug('Ignoring ping back (from %s)', addr[0])
            continue
        logger.debug('relaying ping from %s / %s', info['ip_addrs'], addr[0])
        if self.node_port == self.listen_port:
            info['relay'] = 'y'  # check if this message loops back to self
        bc_sock = asyncoro.AsyncSocket(
            socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
        bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        yield bc_sock.sendto('PING:'.encode() + asyncoro.serialize(info),
                             ('<broadcast>', self.node_port))
        bc_sock.close()
def xfer_file_task(msg):
    # Receive one file over 'conn' and store it under the computation's
    # dest_path; replies 'ACK' when the file is (already) present and
    # complete, 'NAK'/'NACK' otherwise.
    assert coro is not None
    try:
        xf = unserialize(msg)
    except:
        # fix: typo 'trasnfer' -> 'transfer' in the log message
        logger.debug('Ignoring file transfer request from %s', addr[0])
        raise StopIteration
    resp = ''
    if xf.compute_id not in self.computations:
        logger.error('computation "%s" is invalid' % xf.compute_id)
        raise StopIteration
    tgt = os.path.join(self.computations[xf.compute_id].dest_path,
                       os.path.basename(xf.name))
    if os.path.isfile(tgt):
        if _same_file(tgt, xf):
            # identical copy already present: just bump its use count
            yield self.lock.acquire()
            if tgt in self.file_uses:
                self.file_uses[tgt] += 1
            else:
                self.file_uses[tgt] = 1
            yield self.lock.release()
            resp = 'ACK'
        else:
            logger.warning('File "%s" already exists with different status as "%s"',
                           xf.name, tgt)
    if not resp:
        logger.debug('Copying file %s to %s (%s)', xf.name, tgt, xf.stat_buf.st_size)
        try:
            # fix: 'with' guarantees the file handle is closed even when
            # recvall/write raises mid-copy (original leaked 'fd')
            with open(tgt, 'wb') as fd:
                n = 0
                while n < xf.stat_buf.st_size:
                    data = yield conn.recvall(min(xf.stat_buf.st_size-n, 10240000))
                    if not data:
                        break
                    fd.write(data)
                    n += len(data)
                    if self.max_file_size and n > self.max_file_size:
                        logger.warning('File "%s" is too big (%s); it is truncated',
                                       tgt, n)
                        break
            if n < xf.stat_buf.st_size:
                resp = 'NAK (read only %s bytes)' % n
            else:
                resp = 'ACK'
            logger.debug('Copied file %s, %s', tgt, resp)
            # preserve the sender's timestamps and permission bits
            os.utime(tgt, (xf.stat_buf.st_atime, xf.stat_buf.st_mtime))
            os.chmod(tgt, stat.S_IMODE(xf.stat_buf.st_mode))
            self.file_uses[tgt] = 1
        except:
            logger.warning('Copying file "%s" failed with "%s"',
                           xf.name, traceback.format_exc())
            resp = 'NACK'
    try:
        yield conn.send_msg(resp)
    except:
        logger.debug('Could not send reply for "%s"', xf.name)
    raise StopIteration
    # xfer_file_task
def terminate_job_task(msg):
    # Cancel a running job at the scheduler's request: look up (and
    # remove) the job under the lock, terminate its process, then send
    # a DispyJob.Terminated reply back to the scheduler.
    assert coro is not None
    yield self.lock.acquire()
    try:
        _job = unserialize(msg)
        compute = self.computations[_job.compute_id]
        # only the scheduler that owns this computation may cancel its jobs
        assert addr[0] == compute.scheduler_ip_addr
        job_info = self.job_infos.pop(_job.uid, None)
    except:
        logger.debug('Ignoring job request from %s', addr[0])
        raise StopIteration
    finally:
        self.lock.release()
    if job_info is None:
        # job already finished or unknown; nothing to cancel
        logger.debug('Job %s completed; ignoring cancel request from %s',
                     _job.uid, addr[0])
        raise StopIteration
    logger.debug('Terminating job %s', _job.uid)
    job_info.proc.terminate()
    if isinstance(job_info.proc, multiprocessing.Process):
        # func-type job: poll up to ~2s for the worker process to exit
        for x in xrange(20):
            if job_info.proc.is_alive():
                yield coro.sleep(0.1)
            else:
                logger.debug('Process "%s" for job %s terminated',
                             compute.name, _job.uid)
                break
        else:
            logger.warning('Could not kill process %s', compute.name)
            raise StopIteration
    else:
        # prog-type job runs under subprocess.Popen
        assert isinstance(job_info.proc, subprocess.Popen)
        for x in xrange(20):
            rc = job_info.proc.poll()
            logger.debug('Program "%s" for job %s terminated with %s',
                         compute.name, _job.uid, rc)
            if rc is not None:
                break
            # escalate to kill() if still running halfway through the wait
            if x == 10:
                logger.debug('Killing job %s', _job.uid)
                job_info.proc.kill()
            yield coro.sleep(0.1)
        else:
            logger.warning('Could not kill process %s', compute.name)
            raise StopIteration
    # build a fresh Terminated reply for the scheduler
    reply_addr = (addr[0], compute.job_result_port)
    reply = _JobReply(_job, self.ext_ip_addr)
    job_info = _DispyJobInfo(reply, reply_addr, compute)
    reply.status = DispyJob.Terminated
    yield self._send_job_reply(job_info, resending=False, coro=coro)
def listen_udp_proc(self, coro=None):
    """Daemon coroutine: announce any known scheduler once at startup,
    then listen for node 'PING:' broadcasts, record the advertised
    scheduler address and re-broadcast the ping to local nodes."""
    coro.set_daemon()
    bc_sock = asyncoro.AsyncSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    if self.scheduler_ip_addrs and self.scheduler_port:
        relay_request = {'ip_addrs': self.scheduler_ip_addrs, 'port': self.scheduler_port,
                         'version': __version__, 'sign': None}
        # fix: AsyncSocket I/O must be yielded (as done in the loop
        # below); without 'yield' the datagram is never actually sent
        yield bc_sock.sendto('PING:'.encode() + asyncoro.serialize(relay_request),
                             ('<broadcast>', self.node_port))
    # close unconditionally so the socket is not leaked when no
    # scheduler is known yet
    bc_sock.close()
    listen_sock = asyncoro.AsyncSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    listen_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    listen_sock.bind(('', self.listen_port))
    while 1:
        msg, addr = yield listen_sock.recvfrom(1024)
        if not msg.startswith('PING:'.encode()):
            logger.debug('Ignoring message "%s" from %s',
                         msg[:min(len(msg), 5)], addr[0])
            continue
        logger.debug('Ping message from %s (%s)', addr[0], addr[1])
        try:
            info = asyncoro.unserialize(msg[len('PING:'.encode()):])
            if info['version'] != __version__:
                logger.warning('Ignoring %s due to version mismatch: %s / %s',
                               info['ip_addrs'], info['version'], __version__)
                continue
            self.scheduler_ip_addrs = info['ip_addrs'] + [addr[0]]
            self.scheduler_port = info['port']
        except:
            logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
            logger.debug(traceback.format_exc())
            continue
        if info.get('relay', None):
            logger.debug('Ignoring ping back (from %s)', addr[0])
            continue
        logger.debug('relaying ping from %s / %s', info['ip_addrs'], addr[0])
        if self.node_port == self.listen_port:
            info['relay'] = 'y'  # check if this message loops back to self
        bc_sock = asyncoro.AsyncSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
        bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
        yield bc_sock.sendto('PING:'.encode() + asyncoro.serialize(info),
                             ('<broadcast>', self.node_port))
        bc_sock.close()
def tcp_task(conn, addr, coro=None):
    # Handle one TCP ping from a client: skip the authentication bytes,
    # record the advertised scheduler address, and re-broadcast the ping
    # over UDP so local nodes learn about the scheduler.
    conn.settimeout(5)
    try:
        msg = yield conn.recvall(auth_len)  # auth bytes read and discarded
        msg = yield conn.recv_msg()
    except:
        logger.debug(traceback.format_exc())
        logger.debug('Ignoring invalid TCP message from %s:%s' % (addr[0], addr[1]))
        raise StopIteration
    finally:
        # the connection is only used for the two reads above
        conn.close()
    logger.debug('Ping message from %s (%s)', addr[0], addr[1])
    try:
        info = asyncoro.unserialize(msg[len('PING:'.encode()):])
        if info['version'] != __version__:
            logger.warning(
                'Ignoring %s due to version mismatch: %s / %s',
                info['ip_addrs'], info['version'], __version__)
            # NOTE(review): this StopIteration is caught by the bare
            # 'except' below and re-raised after extra logging — confirm
            # that is intended
            raise StopIteration
        # TODO: since dispynetrelay is not aware of computations
        # closing, if more than one client sends ping, nodes will
        # respond to different clients
        self.scheduler_ip_addrs = info['ip_addrs'] + [addr[0]]
        self.scheduler_port = info['port']
    except:
        logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
        logger.debug(traceback.format_exc())
        raise StopIteration
    if info.get('relay', None):
        # a relayed ping would loop forever if re-broadcast; drop it
        logger.debug('Ignoring ping back (from %s)', addr[0])
        raise StopIteration
    logger.debug('relaying ping from %s / %s' % (info['ip_addrs'], addr[0]))
    if self.node_port == self.listen_port:
        info[
            'relay'] = 'y'  # 'check if this message loops back to self
    bc_sock = asyncoro.AsyncSocket(
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    yield bc_sock.sendto('PING:'.encode() + asyncoro.serialize(info),
                         ('<broadcast>', self.node_port))
    bc_sock.close()
def tcp_task(conn, addr, coro=None):
    # Handle one TCP ping from a client: skip the authentication bytes,
    # record the advertised scheduler address, and re-broadcast the ping
    # over UDP so local nodes learn about the scheduler.
    conn.settimeout(5)
    try:
        msg = yield conn.recvall(auth_len)  # auth bytes read and discarded
        msg = yield conn.recv_msg()
    except:
        logger.debug(traceback.format_exc())
        logger.debug('Ignoring invalid TCP message from %s:%s', addr[0], addr[1])
        raise StopIteration
    finally:
        # the connection is only used for the two reads above
        conn.close()
    logger.debug('Ping message from %s (%s)', addr[0], addr[1])
    try:
        info = asyncoro.unserialize(msg[len('PING:'.encode()):])
        if info['version'] != __version__:
            logger.warning('Ignoring %s due to version mismatch: %s / %s',
                           info['ip_addrs'], info['version'], __version__)
            # NOTE(review): this StopIteration is caught by the bare
            # 'except' below and re-raised after extra logging — confirm
            # that is intended
            raise StopIteration
        # TODO: since dispynetrelay is not aware of computations
        # closing, if more than one client sends ping, nodes will
        # respond to different clients
        self.scheduler_ip_addrs = info['ip_addrs'] + [addr[0]]
        self.scheduler_port = info['port']
    except:
        logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
        logger.debug(traceback.format_exc())
        raise StopIteration
    if info.get('relay', None):
        # a relayed ping would loop forever if re-broadcast; drop it
        logger.debug('Ignoring ping back (from %s)', addr[0])
        raise StopIteration
    logger.debug('relaying ping from %s / %s', info['ip_addrs'], addr[0])
    if self.node_port == self.listen_port:
        info['relay'] = 'y'  # 'check if this message loops back to self
    bc_sock = asyncoro.AsyncSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    yield bc_sock.sendto('PING:'.encode() + asyncoro.serialize(info),
                         ('<broadcast>', self.node_port))
    bc_sock.close()
def __job_program(self, _job, job_info):
    """Run a program-type job in a subprocess and queue its reply.

    Executed in its own thread; blocks until the program exits, then
    puts the populated job reply (stdout/stderr/returncode/status) on
    reply_Q.
    """
    compute = self.computations[_job.compute_id]
    program = [compute.name]
    args = unserialize(_job.args)
    program.extend(args)
    logger.debug('Executing "%s"', str(program))
    reply = job_info.job_reply
    try:
        # NOTE(review): os.chdir is process-wide; concurrent program
        # jobs with different dest_paths could race here — confirm
        os.chdir(compute.dest_path)
        env = {}
        env.update(os.environ)
        # fix: don't raise KeyError (and wrongly mark the job
        # Terminated) when PATH is absent from the environment
        env['PATH'] = compute.dest_path + ':' + env.get('PATH', '')
        job_info.proc = subprocess.Popen(program, stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE, env=env)
        assert isinstance(job_info.proc, subprocess.Popen)
        reply.stdout, reply.stderr = job_info.proc.communicate()
        reply.result = job_info.proc.returncode
        reply.status = DispyJob.Finished
    except:
        logger.debug('Executing %s failed with %s', str(program), str(sys.exc_info()))
        reply.exception = traceback.format_exc()
        reply.status = DispyJob.Terminated
    self.reply_Q.put(reply)
def relay_pings(self, ip_addr='', netmask=None, node_port=51348,
                scheduler_node=None, scheduler_port=51347):
    # Relay dispy ping messages between nodes (node_port) and a
    # scheduler (scheduler_port), filtering out pings that originate
    # from our own subnet (netaddr/netmask) to avoid loops.
    netaddr = None
    if not netmask:
        try:
            # allow CIDR notation in ip_addr, e.g. '192.168.1.5/24'
            ip_addr, bits = ip_addr.split('/')
            socket.inet_aton(ip_addr)
            netmask = (0xffffffff << (32 - int(bits))) & 0xffffffff
            netaddr = (struct.unpack('>L', socket.inet_aton(ip_addr))[0]) & netmask
        except:
            # fall back to host-only mask; resolve local address if needed
            netmask = '255.255.255.255'
            if ip_addr:
                socket.inet_aton(ip_addr)  # validates dotted-quad form
            else:
                ip_addr = socket.gethostbyname(socket.gethostname())
    if not netaddr and netmask:
        try:
            if isinstance(netmask, str):
                netmask = struct.unpack('>L', socket.inet_aton(netmask))[0]
            else:
                assert isinstance(netmask, int)
                assert netmask > 0
            netaddr = (struct.unpack('>L', socket.inet_aton(ip_addr))[0]) & netmask
        except:
            logger.warning('Invalid netmask')
    try:
        # sanity check: both must pack back into valid addresses;
        # otherwise disable subnet filtering entirely
        socket.inet_ntoa(struct.pack('>L', netaddr))
        socket.inet_ntoa(struct.pack('>L', netmask))
    except:
        netaddr = netmask = None
    scheduler_version = _dispy_version  # NOTE(review): unused local
    bc_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    bc_sock.bind(('', 0))
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    scheduler_ip_addr = _node_ipaddr(scheduler_node)
    if scheduler_ip_addr and scheduler_port:
        # announce the configured scheduler to local nodes at startup
        relay_request = serialize({'ip_addr': scheduler_ip_addr,
                                   'port': scheduler_port,
                                   'version': _dispy_version, 'sign': None})
        bc_sock.sendto('PING:%s' % relay_request, ('<broadcast>', node_port))
    node_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    node_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    node_sock.bind(('', node_port))
    sched_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sched_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sched_sock.bind(('', scheduler_port))
    logger.info('Listening on %s:%s/%s', ip_addr, node_port, scheduler_port)
    while True:
        # wait for traffic on either the node or the scheduler socket
        ready = select.select([node_sock, sched_sock], [], [])[0]
        for sock in ready:
            if sock == node_sock:
                msg, addr = node_sock.recvfrom(1024)
                if not msg.startswith('PING:'):
                    logger.debug('Ignoring message "%s" from %s',
                                 msg[:min(len(msg), 5)], addr[0])
                    continue
                if netaddr and (struct.unpack('>L', socket.inet_aton(addr[0]))[0] & netmask) == netaddr:
                    # ping came from our own subnet; don't relay it back
                    logger.debug('Ignoring own ping (from %s)', addr[0])
                    continue
                logger.debug('Ping message from %s (%s)', addr[0], addr[1])
                try:
                    info = unserialize(msg[len('PING:'):])
                    scheduler_ip_addr = info['ip_addr']
                    scheduler_port = info['port']
                    assert info['version'] == _dispy_version
                    # scheduler_sign = info['sign']
                    assert isinstance(scheduler_port, int)
                except:
                    logger.debug('Ignoring ping message from %s (%s)',
                                 addr[0], addr[1])
                    logger.debug(traceback.format_exc())
                    continue
                logger.debug('relaying ping from %s / %s' % (info['ip_addr'], addr[0]))
                if scheduler_ip_addr is None:
                    # sender did not name a scheduler; use its own address
                    info['ip_addr'] = scheduler_ip_addr = addr[0]
                relay_request = serialize(info)
                bc_sock.sendto('PING:%s' % relay_request, ('<broadcast>', node_port))
            else:
                assert sock == sched_sock
                msg, addr = sched_sock.recvfrom(1024)
                if msg.startswith('PING:') and scheduler_ip_addr and scheduler_port:
                    try:
                        # NOTE(review): slice length uses 'PONG:' while the
                        # prefix checked is 'PING:'; both are 5 chars so the
                        # result is identical — confirm intent
                        info = unserialize(msg[len('PONG:'):])
                        assert info['version'] == _dispy_version
                        assert isinstance(info['ip_addr'], str)
                        assert isinstance(info['port'], int)
                        # assert isinstance(info['cpus'], int)
                        info['scheduler_ip_addr'] = scheduler_ip_addr
                        relay_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                        relay_sock.sendto('PING:' + serialize(info),
                                          (scheduler_ip_addr, scheduler_port))
                        relay_sock.close()
                    except:
                        logger.debug(traceback.format_exc())
                        # raise
                        logger.debug('Ignoring ping message from %s (%s)',
                                     addr[0], addr[1])
def retrieve_job_task(msg):
    # Return a (possibly already delivered) job result to the scheduler:
    # look among running jobs first, then among result files persisted
    # under dest_path_prefix; a persisted result is deleted once the
    # scheduler acknowledges receipt.
    assert coro is not None
    try:
        req = unserialize(msg)
        assert req['uid'] is not None
        assert req['hash'] is not None
        assert req['compute_id'] is not None
    except:
        resp = serialize('Invalid job')
        try:
            yield conn.send_msg(resp)
        except:
            pass
        raise StopIteration
    job_info = self.job_infos.get(req['uid'], None)
    resp = None
    if job_info is not None:
        # job is still known in memory: send its current reply directly
        try:
            yield conn.send_msg(serialize(job_info.job_reply))
            ack = yield conn.recv_msg()
            # no need to check ack
        except:
            logger.debug('Could not send reply for job %s', req['uid'])
        raise StopIteration
    for d in os.listdir(self.dest_path_prefix):
        info_file = os.path.join(self.dest_path_prefix, d,
                                 '_dispy_job_reply_%s' % req['uid'])
        if os.path.isfile(info_file):
            try:
                # fix: 'with' ensures the file is closed even when
                # unpickling fails (original leaked 'fd')
                with open(info_file, 'rb') as fd:
                    job_reply = pickle.load(fd)
            except:
                job_reply = None
            if hasattr(job_reply, 'hash') and job_reply.hash == req['hash']:
                try:
                    yield conn.send_msg(serialize(job_reply))
                    ack = yield conn.recv_msg()
                    assert ack == 'ACK'
                except:
                    logger.debug('Could not send reply for job %s', req['uid'])
                    raise StopIteration
                try:
                    # delivered: remove the persisted result and update
                    # the computation's pending counters under the lock
                    os.remove(info_file)
                    yield self.lock.acquire()
                    compute = self.computations.get(req['compute_id'], None)
                    if compute is not None:
                        compute.pending_results -= 1
                        if compute.pending_results == 0:
                            compute.zombie = True
                            self.cleanup_computation(compute)
                    self.lock.release()
                except:
                    logger.debug('Could not remove "%s"', info_file)
                raise StopIteration
    else:
        # for-else: no persisted result matched this request
        resp = serialize('Invalid job: %s' % req['uid'])
    if resp:
        try:
            yield conn.send_msg(resp)
        except:
            pass
def add_computation_task(msg):
    # Register a new computation sent by a scheduler: check that no
    # other scheduler owns this node, create a destination directory,
    # compile function-type code, and reply with which transferred
    # files are still needed.
    assert coro is not None
    try:
        compute = unserialize(msg)
    except:
        logger.debug('Ignoring computation request from %s', addr[0])
        try:
            yield conn.send_msg('Invalid computation request')
        except:
            logger.warning('Failed to send reply to %s', str(addr))
        raise StopIteration
    yield self.lock.acquire()
    # only one scheduler may use this node at a time
    if not ((self.scheduler_ip_addr is None) or
            (self.scheduler_ip_addr == compute.scheduler_ip_addr and \
             self.scheduler_port == compute.scheduler_port)):
        logger.debug('Ignoring computation request from %s: %s, %s, %s',
                     compute.scheduler_ip_addr, self.scheduler_ip_addr,
                     self.avail_cpus, self.cpus)
        self.lock.release()
        try:
            yield conn.send_msg('Busy')
        except:
            pass
        raise StopIteration
    resp = 'ACK'
    if compute.dest_path and isinstance(compute.dest_path, str):
        compute.dest_path = compute.dest_path.strip(os.sep)
    else:
        # generate a random, unique directory name
        # (.encode('hex') is the Python 2 hex codec)
        for x in xrange(20):
            compute.dest_path = os.urandom(8).encode('hex')
            if compute.dest_path.find(os.sep) >= 0:
                continue
            if not os.path.isdir(os.path.join(self.dest_path_prefix, compute.dest_path)):
                break
        else:
            logger.warning('Failed to create unique dest_path: %s', compute.dest_path)
            resp = 'NACK'
    compute.dest_path = os.path.join(self.dest_path_prefix, compute.dest_path)
    try:
        os.makedirs(compute.dest_path)
        # owner-only access to the computation's working directory
        os.chmod(compute.dest_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        logger.debug('dest_path for "%s": %s', compute.name, compute.dest_path)
    except:
        logger.warning('Invalid destination path: "%s"', compute.dest_path)
        if os.path.isdir(compute.dest_path):
            os.rmdir(compute.dest_path)
        self.lock.release()
        try:
            yield conn.send_msg('NACK (Invalid dest_path)')
        except:
            logger.warning('Failed to send reply to %s', str(addr))
        raise StopIteration
    if compute.id in self.computations:
        logger.warning('Computation "%s" (%s) is being replaced',
                       compute.name, compute.id)
    # bookkeeping fields used by the timer/cleanup machinery
    setattr(compute, 'last_pulse', time.time())
    setattr(compute, 'pending_jobs', 0)
    setattr(compute, 'pending_results', 0)
    setattr(compute, 'zombie', False)
    logger.debug('xfer_files given: %s',
                 ','.join(xf.name for xf in compute.xfer_files))
    if compute.type == _Compute.func_type:
        try:
            code = compile(compute.code, '<string>', 'exec')
        except:
            logger.warning('Computation "%s" could not be compiled', compute.name)
            if os.path.isdir(compute.dest_path):
                os.rmdir(compute.dest_path)
            self.lock.release()
            try:
                yield conn.send_msg('NACK (Compilation failed)')
            except:
                logger.warning('Failed to send reply to %s', str(addr))
            raise StopIteration
        # store the compiled form; executed later by _dispy_job_func
        compute.code = marshal.dumps(code)
    elif compute.type == _Compute.prog_type:
        assert not compute.code
        compute.name = os.path.join(compute.dest_path, os.path.basename(compute.name))
    xfer_files = []
    for xf in compute.xfer_files:
        tgt = os.path.join(compute.dest_path, os.path.basename(xf.name))
        try:
            if _same_file(tgt, xf):
                # identical file already present; count another use of it
                logger.debug('Ignoring file "%s" / "%s"', xf.name, tgt)
                if tgt not in self.file_uses:
                    self.file_uses[tgt] = 0
                self.file_uses[tgt] += 1
                continue
        except:
            pass
        if self.max_file_size and xf.stat_buf.st_size > self.max_file_size:
            resp = 'NACK (file "%s" too big)' % xf.name
        else:
            xfer_files.append(xf)
    if resp == 'ACK' and ((self.scheduler_ip_addr is not None) and \
                          (self.scheduler_ip_addr != compute.scheduler_ip_addr)):
        resp = 'NACK (busy)'
    if resp == 'ACK':
        self.computations[compute.id] = compute
        self.scheduler_ip_addr = compute.scheduler_ip_addr
        self.scheduler_port = compute.scheduler_port
        self.pulse_interval = compute.pulse_interval
        self.lock.release()
        if xfer_files:
            # tell the scheduler which files still need transferring
            resp += ':XFER_FILES:' + serialize(xfer_files)
        try:
            yield conn.send_msg(resp)
        except:
            # scheduler went away before the ACK: roll back registration
            assert self.scheduler_ip_addr == compute.scheduler_ip_addr
            yield self.lock.acquire()
            del self.computations[compute.id]
            self.scheduler_ip_addr = None
            self.scheduler_port = None
            self.pulse_interval = None
            self.lock.release()
        else:
            self.timer_coro.resume(True)
    else:
        self.lock.release()
        if os.path.isdir(compute.dest_path):
            os.rmdir(compute.dest_path)
        try:
            yield conn.send_msg(resp)
        except:
            pass
def job_request_task(msg):
    # Accept a new job from the scheduler: save any job files, then run
    # the computation either in a new process (func_type, via
    # _dispy_job_func) or in a thread running a program (prog_type),
    # tracking it in job_infos.
    assert coro is not None
    try:
        _job = unserialize(msg)
    except:
        logger.debug('Ignoring job request from %s', addr[0])
        logger.debug(traceback.format_exc())
        raise StopIteration
    yield self.lock.acquire()
    compute = self.computations.get(_job.compute_id, None)
    if compute is not None:
        if compute.scheduler_ip_addr != self.scheduler_ip_addr:
            # computation belongs to a different scheduler; treat as unknown
            compute = None
    yield self.lock.release()
    if self.avail_cpus == 0:
        logger.warning('All cpus busy')
        try:
            yield conn.send_msg('NAK (all cpus busy)')
        except:
            pass
        raise StopIteration
    elif compute is None:
        logger.warning('Invalid computation %s', _job.compute_id)
        try:
            yield conn.send_msg('NAK (invalid computation %s)' % _job.compute_id)
        except:
            pass
        raise StopIteration
    reply_addr = (compute.scheduler_ip_addr, compute.job_result_port)
    logger.debug('New job id %s from %s', _job.uid, addr[0])
    # persist files sent along with the job into the computation's dir
    files = []
    for f in _job.files:
        tgt = os.path.join(compute.dest_path, os.path.basename(f['name']))
        try:
            fd = open(tgt, 'wb')
            fd.write(f['data'])
            fd.close()
        except:
            logger.warning('Could not save file "%s"', tgt)
            continue
        try:
            os.utime(tgt, (f['stat'].st_atime, f['stat'].st_mtime))
            os.chmod(tgt, stat.S_IMODE(f['stat'].st_mode))
        except:
            logger.debug('Could not set modes for "%s"', tgt)
        files.append(tgt)
    _job.files = files
    if compute.type == _Compute.func_type:
        reply = _JobReply(_job, self.ext_ip_addr)
        job_info = _DispyJobInfo(reply, reply_addr, compute)
        # arguments passed to _dispy_job_func in the child process
        args = (job_info, self.certfile, self.keyfile, _job.args, _job.kwargs,
                self.reply_Q, compute.name, compute.code, compute.dest_path,
                _job.files)
        try:
            yield conn.send_msg('ACK')
        except:
            logger.warning('Failed to send response for new job to %s', str(addr))
            raise StopIteration
        job_info.job_reply.status = DispyJob.Running
        job_info.proc = multiprocessing.Process(target=_dispy_job_func, args=args)
        yield self.lock.acquire()
        self.avail_cpus -= 1
        compute.pending_jobs += 1
        self.job_infos[_job.uid] = job_info
        self.lock.release()
        job_info.proc.start()
        raise StopIteration
    elif compute.type == _Compute.prog_type:
        try:
            yield conn.send_msg('ACK')
        except:
            logger.warning('Failed to send response for new job to %s', str(addr))
            raise StopIteration
        reply = _JobReply(_job, self.ext_ip_addr)
        job_info = _DispyJobInfo(reply, reply_addr, compute)
        job_info.job_reply.status = DispyJob.Running
        yield self.lock.acquire()
        self.job_infos[_job.uid] = job_info
        self.avail_cpus -= 1
        compute.pending_jobs += 1
        yield self.lock.release()
        # program jobs run in a thread that blocks on the subprocess
        prog_thread = threading.Thread(target=self.__job_program,
                                       args=(_job, job_info))
        prog_thread.start()
        raise StopIteration
    else:
        try:
            yield conn.send_msg('NAK (invalid computation type "%s")' % compute.type)
        except:
            logger.warning('Failed to send response for new job to %s', str(addr))
def tcp_serve_task(self, conn, addr, coro=None): conn = AsynCoroSocket(conn, blocking=False, keyfile=self.keyfile, certfile=self.certfile) def job_request_task(msg): assert coro is not None try: _job = unserialize(msg) except: logger.debug('Ignoring job request from %s', addr[0]) logger.debug(traceback.format_exc()) raise StopIteration yield self.lock.acquire() compute = self.computations.get(_job.compute_id, None) if compute is not None: if compute.scheduler_ip_addr != self.scheduler_ip_addr: compute = None yield self.lock.release() if self.avail_cpus == 0: logger.warning('All cpus busy') try: yield conn.send_msg('NAK (all cpus busy)') except: pass raise StopIteration elif compute is None: logger.warning('Invalid computation %s', _job.compute_id) try: yield conn.send_msg('NAK (invalid computation %s)' % _job.compute_id) except: pass raise StopIteration reply_addr = (compute.scheduler_ip_addr, compute.job_result_port) logger.debug('New job id %s from %s', _job.uid, addr[0]) files = [] for f in _job.files: tgt = os.path.join(compute.dest_path, os.path.basename(f['name'])) try: fd = open(tgt, 'wb') fd.write(f['data']) fd.close() except: logger.warning('Could not save file "%s"', tgt) continue try: os.utime(tgt, (f['stat'].st_atime, f['stat'].st_mtime)) os.chmod(tgt, stat.S_IMODE(f['stat'].st_mode)) except: logger.debug('Could not set modes for "%s"', tgt) files.append(tgt) _job.files = files if compute.type == _Compute.func_type: reply = _JobReply(_job, self.ext_ip_addr) job_info = _DispyJobInfo(reply, reply_addr, compute) args = (job_info, self.certfile, self.keyfile, _job.args, _job.kwargs, self.reply_Q, compute.name, compute.code, compute.dest_path, _job.files) try: yield conn.send_msg('ACK') except: logger.warning('Failed to send response for new job to %s', str(addr)) raise StopIteration job_info.job_reply.status = DispyJob.Running job_info.proc = multiprocessing.Process(target=_dispy_job_func, args=args) yield self.lock.acquire() self.avail_cpus -= 1 
compute.pending_jobs += 1 self.job_infos[_job.uid] = job_info self.lock.release() job_info.proc.start() raise StopIteration elif compute.type == _Compute.prog_type: try: yield conn.send_msg('ACK') except: logger.warning('Failed to send response for new job to %s', str(addr)) raise StopIteration reply = _JobReply(_job, self.ext_ip_addr) job_info = _DispyJobInfo(reply, reply_addr, compute) job_info.job_reply.status = DispyJob.Running yield self.lock.acquire() self.job_infos[_job.uid] = job_info self.avail_cpus -= 1 compute.pending_jobs += 1 yield self.lock.release() prog_thread = threading.Thread(target=self.__job_program, args=(_job, job_info)) prog_thread.start() raise StopIteration else: try: yield conn.send_msg('NAK (invalid computation type "%s")' % compute.type) except: logger.warning('Failed to send response for new job to %s', str(addr)) def add_computation_task(msg): assert coro is not None try: compute = unserialize(msg) except: logger.debug('Ignoring computation request from %s', addr[0]) try: yield conn.send_msg('Invalid computation request') except: logger.warning('Failed to send reply to %s', str(addr)) raise StopIteration yield self.lock.acquire() if not ((self.scheduler_ip_addr is None) or (self.scheduler_ip_addr == compute.scheduler_ip_addr and \ self.scheduler_port == compute.scheduler_port)): logger.debug('Ignoring computation request from %s: %s, %s, %s', compute.scheduler_ip_addr, self.scheduler_ip_addr, self.avail_cpus, self.cpus) self.lock.release() try: yield conn.send_msg('Busy') except: pass raise StopIteration resp = 'ACK' if compute.dest_path and isinstance(compute.dest_path, str): compute.dest_path = compute.dest_path.strip(os.sep) else: for x in xrange(20): compute.dest_path = os.urandom(8).encode('hex') if compute.dest_path.find(os.sep) >= 0: continue if not os.path.isdir(os.path.join(self.dest_path_prefix, compute.dest_path)): break else: logger.warning('Failed to create unique dest_path: %s', compute.dest_path) resp = 'NACK' 
    # NOTE(review): this span is the visible tail of add_computation_task (its
    # 'def' line lies above this chunk), followed by the sibling task helpers
    # and the dispatch section of the enclosing tcp_serve_task coroutine.
    # 'compute', 'resp', 'conn', 'addr', 'self' and 'coro' are bound earlier,
    # outside this view; 'resp' is presumably initialized to 'ACK' before this
    # point -- TODO confirm against the part of add_computation_task not shown.
    # self.lock is assumed to be held on entry here (every exit path below
    # releases it) -- verify against the start of add_computation_task.
    compute.dest_path = os.path.join(self.dest_path_prefix, compute.dest_path)
    try:
        os.makedirs(compute.dest_path)
        # restrict the computation's working directory to the owner
        os.chmod(compute.dest_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR)
        logger.debug('dest_path for "%s": %s', compute.name, compute.dest_path)
    except:
        # could not create/secure the directory: clean up, release the lock,
        # best-effort NACK to the client and bail out of this coroutine
        logger.warning('Invalid destination path: "%s"', compute.dest_path)
        if os.path.isdir(compute.dest_path):
            os.rmdir(compute.dest_path)
        self.lock.release()
        try:
            yield conn.send_msg('NACK (Invalid dest_path)')
        except:
            logger.warning('Failed to send reply to %s', str(addr))
        raise StopIteration
    if compute.id in self.computations:
        logger.warning('Computation "%s" (%s) is being replaced',
                       compute.name, compute.id)
    # node-local bookkeeping fields attached to the received computation
    setattr(compute, 'last_pulse', time.time())
    setattr(compute, 'pending_jobs', 0)
    setattr(compute, 'pending_results', 0)
    setattr(compute, 'zombie', False)
    logger.debug('xfer_files given: %s', ','.join(xf.name for xf in compute.xfer_files))
    if compute.type == _Compute.func_type:
        # function computations arrive as source; pre-compile and store the
        # marshalled code object for later execution
        try:
            code = compile(compute.code, '<string>', 'exec')
        except:
            logger.warning('Computation "%s" could not be compiled', compute.name)
            if os.path.isdir(compute.dest_path):
                os.rmdir(compute.dest_path)
            self.lock.release()
            try:
                yield conn.send_msg('NACK (Compilation failed)')
            except:
                logger.warning('Failed to send reply to %s', str(addr))
            raise StopIteration
        compute.code = marshal.dumps(code)
    elif compute.type == _Compute.prog_type:
        # program computations carry no code; the executable will live under
        # dest_path, so rewrite its name to the full local path
        assert not compute.code
        compute.name = os.path.join(compute.dest_path, os.path.basename(compute.name))
    # decide which of the declared files still need to be transferred
    xfer_files = []
    for xf in compute.xfer_files:
        tgt = os.path.join(compute.dest_path, os.path.basename(xf.name))
        try:
            if _same_file(tgt, xf):
                # identical copy already present: just bump its use count
                logger.debug('Ignoring file "%s" / "%s"', xf.name, tgt)
                if tgt not in self.file_uses:
                    self.file_uses[tgt] = 0
                self.file_uses[tgt] += 1
                continue
        except:
            pass
        if self.max_file_size and xf.stat_buf.st_size > self.max_file_size:
            resp = 'NACK (file "%s" too big)' % xf.name
        else:
            xfer_files.append(xf)
    # a node serves only one scheduler at a time
    if resp == 'ACK' and ((self.scheduler_ip_addr is not None) and \
                          (self.scheduler_ip_addr != compute.scheduler_ip_addr)):
        resp = 'NACK (busy)'
    if resp == 'ACK':
        # accept the computation and bind this node to its scheduler
        self.computations[compute.id] = compute
        self.scheduler_ip_addr = compute.scheduler_ip_addr
        self.scheduler_port = compute.scheduler_port
        self.pulse_interval = compute.pulse_interval
        self.lock.release()
        if xfer_files:
            # tell the client which files it must still send
            resp += ':XFER_FILES:' + serialize(xfer_files)
        try:
            yield conn.send_msg(resp)
        except:
            # reply failed: undo the registration done above
            assert self.scheduler_ip_addr == compute.scheduler_ip_addr
            yield self.lock.acquire()
            del self.computations[compute.id]
            self.scheduler_ip_addr = None
            self.scheduler_port = None
            self.pulse_interval = None
            self.lock.release()
        else:
            self.timer_coro.resume(True)
    else:
        # rejected: release lock, remove the (empty) dest dir, best-effort NACK
        self.lock.release()
        if os.path.isdir(compute.dest_path):
            os.rmdir(compute.dest_path)
        try:
            yield conn.send_msg(resp)
        except:
            pass

def xfer_file_task(msg):
    # Receive one file for a previously registered computation over 'conn'
    # and store it under the computation's dest_path; replies 'ACK' on
    # success, 'NAK (...)' on short read, 'NACK' on error.
    assert coro is not None
    try:
        xf = unserialize(msg)
    except:
        # NOTE(review): 'trasnfer' typo is in the original log message;
        # left untouched here (log text is runtime behavior)
        logger.debug('Ignoring file trasnfer request from %s', addr[0])
        raise StopIteration
    resp = ''
    if xf.compute_id not in self.computations:
        logger.error('computation "%s" is invalid' % xf.compute_id)
        raise StopIteration
    tgt = os.path.join(self.computations[xf.compute_id].dest_path,
                       os.path.basename(xf.name))
    if os.path.isfile(tgt):
        if _same_file(tgt, xf):
            # identical file already on disk: count another user, skip copy
            yield self.lock.acquire()
            if tgt in self.file_uses:
                self.file_uses[tgt] += 1
            else:
                self.file_uses[tgt] = 1
            yield self.lock.release()
            resp = 'ACK'
        else:
            logger.warning('File "%s" already exists with different status as "%s"',
                           xf.name, tgt)
    if not resp:
        logger.debug('Copying file %s to %s (%s)', xf.name, tgt, xf.stat_buf.st_size)
        try:
            fd = open(tgt, 'wb')
            n = 0
            # pull the announced number of bytes in <=10MB chunks
            while n < xf.stat_buf.st_size:
                data = yield conn.recvall(min(xf.stat_buf.st_size-n, 10240000))
                if not data:
                    break
                fd.write(data)
                n += len(data)
                if self.max_file_size and n > self.max_file_size:
                    logger.warning('File "%s" is too big (%s); it is truncated', tgt, n)
                    break
            fd.close()
            if n < xf.stat_buf.st_size:
                resp = 'NAK (read only %s bytes)' % n
            else:
                resp = 'ACK'
                logger.debug('Copied file %s, %s', tgt, resp)
                # mirror the sender's timestamps and permission bits
                os.utime(tgt, (xf.stat_buf.st_atime, xf.stat_buf.st_mtime))
                os.chmod(tgt, stat.S_IMODE(xf.stat_buf.st_mode))
                self.file_uses[tgt] = 1
        except:
            logger.warning('Copying file "%s" failed with "%s"',
                           xf.name, traceback.format_exc())
            resp = 'NACK'
        try:
            yield conn.send_msg(resp)
        except:
            logger.debug('Could not send reply for "%s"', xf.name)
    raise StopIteration
    # xfer_file_task

def terminate_job_task(msg):
    # Cancel a running job: pop it from job_infos under the lock, terminate
    # its process (multiprocessing.Process or subprocess.Popen), and send a
    # DispyJob.Terminated reply back to the scheduler.
    assert coro is not None
    yield self.lock.acquire()
    try:
        _job = unserialize(msg)
        compute = self.computations[_job.compute_id]
        # only the owning scheduler may cancel its jobs
        assert addr[0] == compute.scheduler_ip_addr
        job_info = self.job_infos.pop(_job.uid, None)
    except:
        logger.debug('Ignoring job request from %s', addr[0])
        raise StopIteration
    finally:
        # finally runs even on the StopIteration above, so the lock is
        # always released
        self.lock.release()
    if job_info is None:
        logger.debug('Job %s completed; ignoring cancel request from %s',
                     _job.uid, addr[0])
        raise StopIteration
    logger.debug('Terminating job %s', _job.uid)
    job_info.proc.terminate()
    if isinstance(job_info.proc, multiprocessing.Process):
        # poll up to 20 times (~2s); for/else: warn if it never died
        for x in xrange(20):
            if job_info.proc.is_alive():
                yield coro.sleep(0.1)
            else:
                logger.debug('Process "%s" for job %s terminated',
                             compute.name, _job.uid)
                break
        else:
            logger.warning('Could not kill process %s', compute.name)
            raise StopIteration
    else:
        assert isinstance(job_info.proc, subprocess.Popen)
        # escalate to kill() halfway through the polling window
        for x in xrange(20):
            rc = job_info.proc.poll()
            # NOTE(review): this logs 'terminated' on every poll even while
            # rc is None -- looks noisy but is the original behavior
            logger.debug('Program "%s" for job %s terminated with %s',
                         compute.name, _job.uid, rc)
            if rc is not None:
                break
            if x == 10:
                logger.debug('Killing job %s', _job.uid)
                job_info.proc.kill()
            yield coro.sleep(0.1)
        else:
            logger.warning('Could not kill process %s', compute.name)
            raise StopIteration
    # notify the scheduler that the job was terminated
    reply_addr = (addr[0], compute.job_result_port)
    reply = _JobReply(_job, self.ext_ip_addr)
    job_info = _DispyJobInfo(reply, reply_addr, compute)
    reply.status = DispyJob.Terminated
    yield self._send_job_reply(job_info, resending=False, coro=coro)

def retrieve_job_task(msg):
    # Resend a (possibly already persisted) job result: first check the
    # in-memory job_infos, then scan dest_path_prefix for a pickled
    # '_dispy_job_reply_<uid>' file whose hash matches the request.
    assert coro is not None
    try:
        req = unserialize(msg)
        assert req['uid'] is not None
        assert req['hash'] is not None
        assert req['compute_id'] is not None
    except:
        resp = serialize('Invalid job')
        try:
            yield conn.send_msg(resp)
        except:
            pass
        raise StopIteration
    job_info = self.job_infos.get(req['uid'], None)
    resp = None
    if job_info is not None:
        # job still live on this node: send its current reply directly
        try:
            yield conn.send_msg(serialize(job_info.job_reply))
            ack = yield conn.recv_msg()
            # no need to check ack
        except:
            logger.debug('Could not send reply for job %s', req['uid'])
        raise StopIteration
    for d in os.listdir(self.dest_path_prefix):
        info_file = os.path.join(self.dest_path_prefix, d,
                                 '_dispy_job_reply_%s' % req['uid'])
        if os.path.isfile(info_file):
            try:
                fd = open(info_file, 'rb')
                job_reply = pickle.load(fd)
                fd.close()
            except:
                job_reply = None
            if hasattr(job_reply, 'hash') and job_reply.hash == req['hash']:
                try:
                    yield conn.send_msg(serialize(job_reply))
                    ack = yield conn.recv_msg()
                    assert ack == 'ACK'
                except:
                    logger.debug('Could not send reply for job %s', req['uid'])
                    raise StopIteration
                try:
                    # result delivered: drop the persisted copy and update
                    # the computation's pending-result accounting
                    os.remove(info_file)
                    yield self.lock.acquire()
                    compute = self.computations.get(req['compute_id'], None)
                    if compute is not None:
                        compute.pending_results -= 1
                        if compute.pending_results == 0:
                            compute.zombie = True
                            self.cleanup_computation(compute)
                    self.lock.release()
                except:
                    logger.debug('Could not remove "%s"', info_file)
                raise StopIteration
            else:
                resp = serialize('Invalid job: %s' % req['uid'])
    if resp:
        try:
            yield conn.send_msg(resp)
        except:
            pass

# tcp_serve_task starts
# authenticate the client, read one command message and dispatch it to the
# matching task helper above; every branch closes the connection when done
try:
    req = yield conn.recvall(len(self.auth_code))
    assert req == self.auth_code
except:
    logger.warning('Ignoring request; invalid client authentication?')
    conn.close()
    raise StopIteration
msg = yield conn.recv_msg()
if not msg:
    conn.close()
    raise StopIteration
if msg.startswith('JOB:'):
    msg = msg[len('JOB:'):]
    yield job_request_task(msg)
    conn.close()
elif msg.startswith('COMPUTE:'):
    msg = msg[len('COMPUTE:'):]
    yield add_computation_task(msg)
    conn.close()
elif msg.startswith('FILEXFER:'):
    msg = msg[len('FILEXFER:'):]
    yield xfer_file_task(msg)
    conn.close()
elif msg.startswith('DEL_COMPUTE:'):
    msg = msg[len('DEL_COMPUTE:'):]
    try:
        info = unserialize(msg)
        compute_id = info['ID']
        yield self.lock.acquire()
        compute = self.computations.get(compute_id, None)
        if compute is None:
            logger.warning('Computation "%s" is not valid', compute_id)
        else:
            # mark as zombie so cleanup_computation tears it down
            compute.zombie = True
            self.cleanup_computation(compute)
        self.lock.release()
    except:
        logger.debug('Deleting computation failed with %s', traceback.format_exc())
        # raise
    conn.close()
elif msg.startswith('TERMINATE_JOB:'):
    msg = msg[len('TERMINATE_JOB:'):]
    yield terminate_job_task(msg)
    conn.close()
elif msg.startswith('RETRIEVE_JOB:'):
    msg = msg[len('RETRIEVE_JOB:'):]
    yield retrieve_job_task(msg)
    conn.close()
else:
    # unknown command: log/NAK with at most the first 10 bytes of the message
    logger.warning('Invalid request "%s" from %s', msg[:min(10, len(msg))], addr[0])
    resp = 'NAK (invalid command: %s)' % (msg[:min(10, len(msg))])
    try:
        yield conn.send_msg(resp)
    except:
        logger.warning('Failed to send reply to %s', str(addr))
    conn.close()
def udp_server(self, scheduler_ip_addr, coro=None):
    """Daemon coroutine serving this node's UDP control port.

    Announces availability with a PONG (directly to scheduler_ip_addr if
    given), then loops forever handling datagrams on self.udp_sock:
    'PING:'        -> reply with the PONG message (only when all CPUs free)
    'PULSE:'       -> refresh last_pulse on all registered computations
    'SERVERPORT:'  -> send this node's address/signature to the requester
    """
    assert coro is not None
    coro.set_daemon()
    if self.avail_cpus == self.cpus:
        # node fully idle: broadcast availability
        yield self.send_pong_msg(coro=coro)
    pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1],
                'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version}
    pong_msg = 'PONG:' + serialize(pong_msg)
    if scheduler_ip_addr:
        # a specific scheduler was named: pong it directly as well
        sock = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM))
        try:
            yield sock.sendto(pong_msg, (scheduler_ip_addr, self.scheduler_port))
        except:
            logger.warning("Couldn't send ping message to %s:%s",
                           scheduler_ip_addr, self.scheduler_port)
        finally:
            sock.close()
    while True:
        msg, addr = yield self.udp_sock.recvfrom(1024)
        # TODO: process each message as separate Coro, so
        # exceptions are contained?
        if msg.startswith('PING:'):
            if self.cpus != self.avail_cpus:
                # some CPUs are in use; don't advertise to other schedulers
                logger.debug('Busy (%s/%s); ignoring ping message from %s',
                             self.cpus, self.avail_cpus, addr[0])
                continue
            try:
                info = unserialize(msg[len('PING:'):])
                # validate the scheduler's address/port/version before replying
                socket.inet_aton(info['scheduler_ip_addr'])
                assert isinstance(info['scheduler_port'], int)
                assert info['version'] == _dispy_version
                addr = (info['scheduler_ip_addr'], info['scheduler_port'])
            except:
                # raise
                logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
                continue
            yield self.udp_sock.sendto(pong_msg, addr)
        elif msg.startswith('PULSE:'):
            try:
                info = unserialize(msg[len('PULSE:'):])
                # only the scheduler this node is bound to may pulse it
                assert info['ip_addr'] == self.scheduler_ip_addr
                yield self.lock.acquire()
                for compute in self.computations.itervalues():
                    compute.last_pulse = time.time()
                yield self.lock.release()
            except:
                logger.warning('Ignoring PULSE from %s', addr[0])
        elif msg.startswith('SERVERPORT:'):
            try:
                req = unserialize(msg[len('SERVERPORT:'):])
                sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                reply = {'ip_addr':self.address[0], 'port':self.address[1],
                         'sign':self.signature, 'version':_dispy_version}
                sock = AsynCoroSocket(sock, blocking=False)
                sock.settimeout(1)
                yield sock.sendto(serialize(reply), (req['ip_addr'], req['port']))
                sock.close()
            except:
                logger.debug(traceback.format_exc())
                # pass
        else:
            logger.warning('Ignoring ping message from %s', addr[0])
def relay_pings(self, ip_addr='', netmask=None, node_port=51348,
                scheduler_node=None, scheduler_port=51347):
    """Relay dispy discovery messages between network segments (bytes protocol).

    Listens on node_port for scheduler 'PING:' broadcasts and rebroadcasts
    them locally, and on scheduler_port for node pings, answering those with
    the last known scheduler address(es).  netmask/netaddr, when derivable
    from ip_addr (CIDR) or the netmask argument, are used to ignore our own
    relayed broadcasts and avoid relay loops.  Runs forever.
    """
    netaddr = None
    if not netmask:
        # ip_addr may be given in CIDR form 'a.b.c.d/bits'
        try:
            ip_addr, bits = ip_addr.split('/')
            socket.inet_aton(ip_addr)
            netmask = (0xffffffff << (32 - int(bits))) & 0xffffffff
            netaddr = (struct.unpack('>L', socket.inet_aton(ip_addr))[0]) & netmask
        except:
            netmask = '255.255.255.255'
    if ip_addr:
        socket.inet_aton(ip_addr)
    else:
        ip_addr = socket.gethostbyname(socket.gethostname())
    if not netaddr and netmask:
        try:
            if isinstance(netmask, str):
                # dotted-quad netmask -> 32-bit integer
                netmask = struct.unpack('>L', socket.inet_aton(netmask))[0]
            else:
                assert isinstance(netmask, int)
                assert netmask > 0
            netaddr = (struct.unpack('>L', socket.inet_aton(ip_addr))[0]) & netmask
        except:
            logger.warning('Invalid netmask')
    try:
        # sanity check: both must round-trip through inet_ntoa
        socket.inet_ntoa(struct.pack('>L', netaddr))
        socket.inet_ntoa(struct.pack('>L', netmask))
    except:
        netaddr = netmask = None
    bc_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    scheduler_ip_addrs = list(filter(lambda ip: bool(ip), [_node_ipaddr(scheduler_node)]))
    if scheduler_ip_addrs and scheduler_port:
        # a scheduler was named explicitly: announce it to local nodes once
        relay_request = {'ip_addrs': scheduler_ip_addrs, 'port': scheduler_port,
                         'version': _dispy_version, 'sign': None}
        bc_sock.sendto(b'PING:' + serialize(relay_request), ('<broadcast>', node_port))
    bc_sock.close()
    # node_sock: scheduler broadcasts arriving for nodes;
    # sched_sock: node pings looking for a scheduler
    node_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    node_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    node_sock.bind(('', node_port))
    sched_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sched_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sched_sock.bind(('', scheduler_port))
    logger.info('Listening on %s:%s/%s', ip_addr, node_port, scheduler_port)
    while True:
        ready = select.select([node_sock, sched_sock], [], [])[0]
        for sock in ready:
            if sock == node_sock:
                msg, addr = node_sock.recvfrom(1024)
                if not msg.startswith(b'PING:'):
                    # log at most the first 5 bytes of the unexpected message
                    logger.debug('Ignoring message "%s" from %s',
                                 msg[:min(len(msg), 5)], addr[0])
                    continue
                if netaddr and \
                   (struct.unpack('>L', socket.inet_aton(addr[0]))[0] & netmask) == netaddr:
                    # came from our own subnet, i.e. our own relay: drop it
                    logger.debug('Ignoring ping back (from %s)', addr[0])
                    continue
                logger.debug('Ping message from %s (%s)', addr[0], addr[1])
                try:
                    info = unserialize(msg[len(b'PING:'):])
                    if info['version'] != _dispy_version:
                        logger.warning('Ignoring %s due to version mismatch: %s / %s',
                                       info['ip_addrs'], info['version'], _dispy_version)
                        continue
                    # remember scheduler address(es) for answering node pings
                    scheduler_ip_addrs = info['ip_addrs'] + [addr[0]]
                    scheduler_port = info['port']
                except:
                    logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
                    logger.debug(traceback.format_exc())
                    continue
                logger.debug('relaying ping from %s / %s' % (info['ip_addrs'], addr[0]))
                bc_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
                bc_sock.sendto(b'PING:' + serialize(info), ('<broadcast>', node_port))
                bc_sock.close()
            else:
                assert sock == sched_sock
                msg, addr = sched_sock.recvfrom(1024)
                if msg.startswith(b'PING:') and scheduler_ip_addrs and scheduler_port:
                    try:
                        info = unserialize(msg[len(b'PING:'):])
                        if netaddr and info.get('scheduler_ip_addr', None) and \
                           (struct.unpack('>L', socket.inet_aton(info['scheduler_ip_addr']))[0] & netmask) == netaddr:
                            logger.debug('Ignoring ping back (from %s)' % addr[0])
                            continue
                        assert info['version'] == _dispy_version
                        # assert isinstance(info['cpus'], int)
                        # answer the pinging node with the known scheduler info
                        msg = {'ip_addrs': scheduler_ip_addrs, 'port': scheduler_port,
                               'version': _dispy_version}
                        relay_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                        relay_sock.sendto(b'PING:' + serialize(msg),
                                          (info['ip_addr'], info['port']))
                        relay_sock.close()
                    except:
                        logger.debug(traceback.format_exc())
                        # raise
                        logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
def relay_pings(self, ip_addr='', netmask=None, node_port=51348,
                scheduler_node=None, scheduler_port=51347):
    """Relay dispy discovery messages between network segments (str protocol).

    Listens on node_port for scheduler 'PING:' messages (rebroadcast locally)
    and on scheduler_port for node 'PONG:' replies (answered with a targeted
    'PING:' carrying the last known scheduler address).  netmask/netaddr,
    when derivable from a CIDR ip_addr or the netmask argument, are used to
    ignore this relay's own broadcasts.  Runs forever; never returns.

    BUGFIX(review): the two "ignoring message" log lines sliced the message
    with max(len(msg), 5), which logs the WHOLE datagram; the intent (and the
    sibling relay_pings implementation) is a <=5-byte prefix, i.e. min().
    """
    netaddr = None
    if not netmask:
        # ip_addr may be given in CIDR form 'a.b.c.d/bits'
        try:
            ip_addr, bits = ip_addr.split('/')
            socket.inet_aton(ip_addr)
            netmask = (0xffffffff << (32 - int(bits))) & 0xffffffff
            netaddr = (struct.unpack('>L', socket.inet_aton(ip_addr))[0]) & netmask
        except:
            netmask = '255.255.255.255'
    if ip_addr:
        socket.inet_aton(ip_addr)
    else:
        ip_addr = socket.gethostbyname(socket.gethostname())
    if not netaddr and netmask:
        try:
            if isinstance(netmask, str):
                # dotted-quad netmask -> 32-bit integer
                netmask = struct.unpack('>L', socket.inet_aton(netmask))[0]
            else:
                assert isinstance(netmask, int)
                assert netmask > 0
            netaddr = (struct.unpack('>L', socket.inet_aton(ip_addr))[0]) & netmask
        except:
            logger.warning('Invalid netmask')
    try:
        # sanity check: both must round-trip through inet_ntoa
        socket.inet_ntoa(struct.pack('>L', netaddr))
        socket.inet_ntoa(struct.pack('>L', netmask))
    except:
        netaddr = netmask = None
    scheduler_version = _dispy_version
    bc_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    bc_sock.bind(('', 0))
    bc_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
    scheduler_ip_addr = _node_ipaddr(scheduler_node)
    if scheduler_ip_addr and scheduler_port:
        # a scheduler was named explicitly: announce it to local nodes once
        relay_request = serialize({'scheduler_ip_addr':scheduler_ip_addr,
                                   'scheduler_port':scheduler_port,
                                   'version':scheduler_version})
        bc_sock.sendto('PING:%s' % relay_request, ('<broadcast>', node_port))
    # ping_sock: scheduler pings arriving for nodes;
    # pong_sock: node pongs answering a relayed ping
    ping_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    ping_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    ping_sock.bind(('', node_port))
    pong_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    pong_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    pong_sock.bind(('', scheduler_port))
    logger.info('Listening on %s:%s', ip_addr, node_port)
    last_ping = 0
    while True:
        ready = select.select([ping_sock, pong_sock], [], [])[0]
        for sock in ready:
            if sock == ping_sock:
                msg, addr = ping_sock.recvfrom(1024)
                if not msg.startswith('PING:'):
                    # log at most the first 5 bytes of the unexpected message
                    # (was max(len(msg), 5), which dumped the whole datagram)
                    logger.debug('Ignoring message "%s" from %s',
                                 msg[:min(len(msg), 5)], addr[0])
                    continue
                if netaddr and (struct.unpack('>L', socket.inet_aton(addr[0]))[0] & netmask) == netaddr:
                    # came from our own subnet, i.e. our own relay: drop it
                    logger.debug('Ignoring own ping (from %s)', addr[0])
                    continue
                if (time.time() - last_ping) < 10:
                    # NOTE(review): this throttle blocks BOTH sockets for
                    # 10s and then still processes the ping -- presumably a
                    # crude broadcast-storm damper; confirm before changing
                    logger.warning('Ignoring ping (from %s) for 10 more seconds', addr[0])
                    time.sleep(10)
                last_ping = time.time()
                logger.debug('Ping message from %s (%s)', addr[0], addr[1])
                try:
                    data = unserialize(msg[len('PING:'):])
                    # remember the scheduler for answering later pongs
                    scheduler_ip_addr = data['scheduler_ip_addr']
                    scheduler_port = data['scheduler_port']
                    scheduler_version = data['version']
                    assert isinstance(scheduler_ip_addr, str)
                    assert isinstance(scheduler_port, int)
                except:
                    logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1])
                    continue
                relay_request = serialize({'scheduler_ip_addr':scheduler_ip_addr,
                                           'scheduler_port':scheduler_port,
                                           'version':scheduler_version})
                bc_sock.sendto('PING:%s' % relay_request, ('<broadcast>', node_port))
            else:
                assert sock == pong_sock
                msg, addr = pong_sock.recvfrom(1024)
                if not msg.startswith('PONG:'):
                    # same <=5-byte prefix fix as above (was max())
                    logger.debug('Ignoring pong message "%s" from %s',
                                 msg[:min(len(msg), 5)], addr[0])
                    continue
                # if netaddr and (struct.unpack('>L', socket.inet_aton(addr[0]))[0] & netmask) == netaddr:
                #     logger.debug('Ignoring own pong (from %s)', addr[0])
                #     continue
                if not (scheduler_ip_addr and scheduler_port):
                    logger.debug('Ignoring pong message from %s', str(addr))
                    continue
                logger.debug('Pong message from %s (%s)', addr[0], addr[1])
                try:
                    pong = unserialize(msg[len('PONG:'):])
                    assert isinstance(pong['host'], str)
                    assert isinstance(pong['port'], int)
                    assert isinstance(pong['cpus'], int)
                    # point the ponging node directly at the known scheduler
                    relay_request = serialize({'scheduler_ip_addr':scheduler_ip_addr,
                                               'scheduler_port':scheduler_port,
                                               'version':scheduler_version})
                    relay_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
                    relay_sock.sendto('PING:%s' % relay_request, (pong['host'], node_port))
                    relay_sock.close()
                except:
                    # raise
                    logger.debug('Ignoring pong message from %s (%s)', addr[0], addr[1])