def _discoro_proc():
    # coroutine
    """Server process receives computations and runs coroutines for it.

    This generator runs as an asyncoro coroutine. It first receives a
    configuration message (a dict with 'req' == 'config' and at least the
    keys 'id', 'phoenix', 'auth', 'serve' and 'mp_queue'), prepares a private
    working directory and PID file, and then serves dict requests selected by
    their 'req' key:

    * 'setup'     - install a computation (at most one at a time); replies
                    0 on success and -1 on failure / busy.
    * 'run'       - create a job coroutine for the current computation and
                    reply with it (or with an exception tuple on failure).
    * 'close'     - terminate jobs, restore module globals and clean the
                    working directory.
    * 'node_info' - reply with a DiscoroNodeInfo.
    * 'status'    - print current usage (requires node 'auth').
    * 'quit'      - stop accepting requests; wait for running jobs.
    * 'terminate' - close the current computation (if any) and stop.

    All local names carry the '_discoro_' prefix because computation code is
    exec'ed into this module's globals; the prefix keeps server state apart
    from names the computation may define.
    """

    import os
    import shutil
    import traceback
    import sys
    import time
    try:
        import psutil
    except Exception:
        # psutil is optional; without it node status / info is not reported
        psutil = None

    import asyncoro.disasyncoro as asyncoro
    from asyncoro import Coro
    from asyncoro.discoro import MinPulseInterval, MaxPulseInterval, \
        DiscoroNodeInfo, DiscoroNodeStatus

    _discoro_coro = asyncoro.AsynCoro.cur_coro()
    _discoro_config = yield _discoro_coro.receive()
    assert _discoro_config['req'] == 'config'
    _discoro_coro.register('discoro_server')
    _discoro_name = asyncoro.AsynCoro.instance().name
    asyncoro.AsynCoro.instance().dest_path = os.path.join('discoro',
                                                          'server%s' % (_discoro_config['id']))
    # read the path back from the scheduler instance after setting it
    _discoro_dest_path = asyncoro.AsynCoro.instance().dest_path
    _discoro_pid_path = os.path.join(_discoro_dest_path, '..',
                                     'server%s.pid' % (_discoro_config['id']))
    _discoro_pid_path = os.path.normpath(_discoro_pid_path)
    # TODO: is file locking necessary?
    if os.path.exists(_discoro_pid_path):
        with open(_discoro_pid_path, 'r') as _discoro_req:
            _discoro_var = _discoro_req.read()
        _discoro_var = int(_discoro_var)
        if not _discoro_config['phoenix']:
            print('\n Another discoronode seems to be running;\n'
                  ' make sure server with PID %d quit and remove "%s"\n' %
                  (_discoro_var, _discoro_pid_path))
            # without 'phoenix' the signal below is sent to this process itself
            _discoro_var = os.getpid()

        import signal
        try:
            os.kill(_discoro_var, signal.SIGTERM)
        except Exception:
            pass
        else:
            time.sleep(0.1)
            try:
                if os.waitpid(_discoro_var, os.WNOHANG)[0] != _discoro_var:
                    asyncoro.logger.warning('Killing process %d failed' % _discoro_var)
            except Exception:
                pass
        del signal
    if os.path.isdir(_discoro_dest_path):
        shutil.rmtree(_discoro_dest_path)
    os.makedirs(_discoro_dest_path)
    os.chdir(_discoro_dest_path)
    with open(_discoro_pid_path, 'w') as _discoro_var:
        _discoro_var.write('%s' % os.getpid())
    asyncoro.logger.debug('discoro server "%s" started at %s; '
                          'computation files will be saved in "%s"' %
                          (_discoro_name, _discoro_coro.location, _discoro_dest_path))
    # define all server state up front so the snapshots taken below are
    # complete and the nested coroutines can close over these names
    _discoro_req = _discoro_client = _discoro_auth = _discoro_msg = None
    _discoro_timer_coro = _discoro_pulse_coro = _discoro_timer_proc = _discoro_peer_status = None
    _discoro_monitor_coro = _discoro_monitor_proc = _discoro_node_status = None
    _discoro_computation = _discoro_func = _discoro_var = None
    _discoro_job_coros = set()
    _discoro_busy_time = time.time()
    # snapshot of module globals; used at 'close' to drop whatever a
    # computation added and to restore whatever it overwrote
    _discoro_globals = {}
    _discoro_locals = {}
    _discoro_globals.update(globals())
    _discoro_locals.update(locals())

    def _discoro_timer_proc(coro=None):
        """Send periodic pulse messages to the scheduler's pulse coroutine.

        Sleeps until resumed with a truthy value, which becomes the pulse
        interval if a pulse coroutine is set (otherwise the timer goes back
        to sleeping indefinitely). On each timeout a pulse is sent; if pulses
        fail for more than 5 intervals the computation is closed.
        """
        coro.set_daemon()
        last_pulse = time.time()
        interval = None
        while True:
            reset = yield coro.sleep(interval)
            if reset:
                if not isinstance(_discoro_pulse_coro, Coro):
                    interval = None
                    continue
                interval = reset
                last_pulse = time.time()
                continue

            if not _discoro_pulse_coro:
                continue
            msg = {'ncoros': len(_discoro_job_coros), 'location': coro.location}
            if _discoro_node_status:
                msg['node_status'] = DiscoroNodeStatus(coro.location.addr, psutil.cpu_percent(),
                                                       psutil.virtual_memory().percent,
                                                       psutil.disk_usage(_discoro_dest_path).percent)
            if _discoro_pulse_coro.send(msg) == 0:
                last_pulse = time.time()
            elif (time.time() - last_pulse) > (5 * interval) and _discoro_computation:
                asyncoro.logger.warning('scheduler is not reachable; closing computation "%s"' %
                                        _discoro_computation._auth)
                _discoro_coro.send({'req': 'close', 'auth': _discoro_computation._auth})

            # guard against a missing computation so this daemon coroutine
            # can never die on an attribute access of None
            if (_discoro_computation and (not _discoro_job_coros) and
                _discoro_computation.zombie_period and
                ((time.time() - _discoro_busy_time) > _discoro_computation.zombie_period)):
                asyncoro.logger.debug('%s: zombie computation "%s"' %
                                      (coro.location, _discoro_computation._auth))
                # TODO: close? For now wait for "too many" timeouts to close

    def _discoro_peer_status(coro=None):
        """Close the computation when the scheduler's peer goes offline."""
        coro.set_daemon()
        while True:
            status = yield coro.receive()
            if isinstance(status, asyncoro.PeerStatus) and \
               status.status == asyncoro.PeerStatus.Offline and \
               _discoro_computation and \
               _discoro_pulse_coro and _discoro_pulse_coro.location == status.location:
                asyncoro.logger.debug('scheduler at %s quit; closing computation %s' %
                                      (status.location, _discoro_computation._auth))
                msg = {'req': 'close', 'auth': _discoro_computation._auth}
                _discoro_coro.send(msg)

    def _discoro_monitor_proc(coro=None):
        """Track job completion: drop finished jobs and refresh busy time."""
        nonlocal _discoro_busy_time
        coro.set_daemon()
        while True:
            msg = yield coro.receive()
            if isinstance(msg, asyncoro.MonitorException):
                _discoro_busy_time = time.time()
                asyncoro.logger.debug('job %s done' % msg.args[0])
                _discoro_job_coros.discard(msg.args[0])
            else:
                asyncoro.logger.warning('%s: invalid monitor message ignored' % coro.location)

    _discoro_timer_coro = Coro(_discoro_timer_proc)
    _discoro_monitor_coro = Coro(_discoro_monitor_proc)
    asyncoro.AsynCoro.instance().peer_status(Coro(_discoro_peer_status))

    while True:
        _discoro_msg = yield _discoro_coro.receive()
        if not isinstance(_discoro_msg, dict):
            continue
        _discoro_req = _discoro_msg.get('req', None)

        if _discoro_req == 'run':
            _discoro_client = _discoro_msg.get('client', None)
            _discoro_auth = _discoro_msg.get('auth', None)
            _discoro_func = _discoro_msg.get('func', None)
            if not isinstance(_discoro_client, Coro) or not _discoro_computation or \
               _discoro_auth != _discoro_computation._auth:
                asyncoro.logger.warning('invalid run: %s' % (type(_discoro_func)))
                if isinstance(_discoro_client, Coro):
                    _discoro_client.send(None)
                continue
            # NOTE(review): the function code comes from a remote client and
            # is exec'ed as-is; only the computation's auth is checked above.
            try:
                _discoro_func = asyncoro.unserialize(_discoro_func)
                if _discoro_func.code:
                    exec(_discoro_func.code, globals())
                job_coro = Coro(globals()[_discoro_func.name],
                                *(_discoro_func.args), **(_discoro_func.kwargs))
            except:
                # deliberately broad: any failure in client code is reported
                # back to the client as (exception type, name, traceback)
                asyncoro.logger.debug('invalid computation to run')
                # _discoro_func = Scheduler._Function(_discoro_func.name, None,
                #                                     _discoro_func.args, _discoro_func.kwargs)
                job_coro = (sys.exc_info()[0], getattr(_discoro_func, 'name', _discoro_func),
                            traceback.format_exc())
            else:
                asyncoro.logger.debug('job %s created' % job_coro)
                _discoro_job_coros.add(job_coro)
                job_coro.notify(_discoro_monitor_coro)
                _discoro_var = _discoro_msg.get('notify', None)
                if isinstance(_discoro_var, Coro):
                    job_coro.notify(_discoro_var)
                _discoro_busy_time = time.time()
            _discoro_client.send(job_coro)
            del job_coro

        elif _discoro_req == 'setup':
            _discoro_client = _discoro_msg.get('client', None)
            # Keep the requested pulse coroutine in a scratch variable until
            # the setup is accepted: a malformed or rejected ("busy") request
            # must not replace the pulse coroutine of the computation that is
            # currently being served.
            _discoro_var = _discoro_msg.get('pulse_coro', None)
            if not isinstance(_discoro_client, Coro) or not isinstance(_discoro_var, Coro):
                continue
            if _discoro_computation is not None:
                asyncoro.logger.debug('invalid "setup" - busy')
                _discoro_client.send(-1)
                continue
            os.chdir(_discoro_dest_path)
            try:
                _discoro_computation = _discoro_msg['computation']
                exec('import asyncoro.disasyncoro as asyncoro', globals())
                if __name__ == '__mp_main__':  # Windows multiprocessing process
                    exec('import asyncoro.disasyncoro as asyncoro',
                         sys.modules['__mp_main__'].__dict__)
                if _discoro_computation._code:
                    exec(_discoro_computation._code, globals())
                    if __name__ == '__mp_main__':  # Windows multiprocessing process
                        exec(_discoro_computation._code, sys.modules['__mp_main__'].__dict__)
            except:
                # deliberately broad: computation code is arbitrary client code
                _discoro_computation = None
                asyncoro.logger.warning('invalid computation')
                asyncoro.logger.debug(traceback.format_exc())
                _discoro_client.send(-1)
                continue

            # setup accepted; only now commit the pulse coroutine (must be set
            # before the timer is resumed, as the timer checks it)
            _discoro_pulse_coro = _discoro_var
            if psutil and _discoro_msg.get('node_status', None):
                _discoro_node_status = True
            if isinstance(_discoro_computation.pulse_interval, int) and \
               MinPulseInterval <= _discoro_computation.pulse_interval <= MaxPulseInterval:
                _discoro_computation.pulse_interval = _discoro_computation.pulse_interval
            else:
                _discoro_computation.pulse_interval = MinPulseInterval
            _discoro_timer_coro.resume(_discoro_computation.pulse_interval)
            _discoro_busy_time = time.time()
            asyncoro.logger.debug('computation "%s" from %s' %
                                  (_discoro_computation._auth, _discoro_msg['client'].location))
            _discoro_client.send(0)

        elif _discoro_req == 'close':
            _discoro_auth = _discoro_msg.get('auth', None)
            # accept either the computation's auth or the node's own auth
            if not _discoro_computation or (_discoro_auth != _discoro_computation._auth and
                                            _discoro_auth != _discoro_config['auth']):
                continue
            asyncoro.logger.debug('%s deleting computation "%s"' %
                                  (_discoro_coro.location, _discoro_computation._auth))
            # closed with the node's auth (not by the scheduler): tell scheduler
            if _discoro_auth != _discoro_computation._auth and _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerClosed',
                                          'location': _discoro_coro.location})
            for _discoro_var in _discoro_job_coros:
                _discoro_var.terminate()
            _discoro_job_coros = set()

            # drop globals added by the computation and restore the snapshot
            if __name__ == '__mp_main__':  # Windows multiprocessing process
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                        sys.modules['__mp_main__'].__dict__.pop(_discoro_var, None)
                globals().update(_discoro_globals)
                sys.modules['__mp_main__'].__dict__.update(_discoro_globals)
            else:
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                globals().update(_discoro_globals)

            # remove everything the computation left in the working directory
            for _discoro_var in os.listdir(_discoro_dest_path):
                _discoro_var = os.path.join(_discoro_dest_path, _discoro_var)
                if os.path.isdir(_discoro_var) and not os.path.islink(_discoro_var):
                    shutil.rmtree(_discoro_var, ignore_errors=True)
                else:
                    os.remove(_discoro_var)
            if not os.path.isdir(_discoro_dest_path):
                try:
                    os.remove(_discoro_dest_path)
                except Exception:
                    pass
                os.makedirs(_discoro_dest_path)
            if not os.path.isfile(_discoro_pid_path):
                # PID path was replaced or removed by the computation; clear
                # whatever is there (if anything) and write the PID file again
                try:
                    if os.path.islink(_discoro_pid_path):
                        os.remove(_discoro_pid_path)
                    elif os.path.isdir(_discoro_pid_path):
                        shutil.rmtree(_discoro_pid_path)
                    with open(_discoro_pid_path, 'w') as _discoro_var:
                        _discoro_var.write('%s' % os.getpid())
                except Exception:
                    asyncoro.logger.warning('PID file "%s" is invalid' % _discoro_pid_path)
            os.chdir(_discoro_dest_path)
            asyncoro.AsynCoro.instance().dest_path = _discoro_dest_path
            _discoro_computation = _discoro_client = _discoro_pulse_coro = None
            _discoro_node_status = None
            # 'serve' > 0 counts down the computations left to serve; the
            # server stops when it reaches 0
            if _discoro_config['serve'] > 0:
                _discoro_config['serve'] -= 1
                if _discoro_config['serve'] == 0:
                    break
            # pulse coroutine is None now, so this puts the timer back to sleep
            _discoro_timer_coro.resume(MinPulseInterval)

        elif _discoro_req == 'node_info':
            if psutil:
                info = DiscoroNodeInfo(
                    _discoro_name, _discoro_coro.location.addr,
                    psutil.cpu_count(), psutil.cpu_percent(),
                    {_discoro_var: getattr(psutil.virtual_memory(), _discoro_var)
                     for _discoro_var in ['total', 'percent']},
                    {_discoro_var: getattr(psutil.disk_usage(_discoro_dest_path), _discoro_var)
                     for _discoro_var in ['total', 'percent']}
                    )
                if _discoro_msg.get('node_status', None):
                    _discoro_node_status = True
            else:
                info = DiscoroNodeInfo(_discoro_name, _discoro_coro.location.addr,
                                       -1, -1, None, None)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(info)

        elif _discoro_req == 'status':
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring info: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                print(' Server %s running %d coroutines for computation at %s' %
                      (_discoro_coro.location, len(_discoro_job_coros),
                       _discoro_pulse_coro.location))
            else:
                print(' Server %s not used by any computation' % (_discoro_coro.location))

        elif _discoro_req == 'quit':
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring quit: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerClosed',
                                          'location': _discoro_coro.location})
            break

        elif _discoro_req == 'terminate':
            if _discoro_msg.get('auth', None) != _discoro_config['auth']:
                asyncoro.logger.debug('ignoring terminate: %s' % (_discoro_msg.get('auth')))
                continue
            if _discoro_pulse_coro:
                _discoro_pulse_coro.send({'status': 'ServerTerminated',
                                          'location': _discoro_coro.location})
            if _discoro_computation:
                # queue a 'close' to self; with 'serve' == 1 that close
                # request ends the main loop after cleaning up
                msg = {'req': 'close', 'auth': _discoro_computation._auth}
                _discoro_config['serve'] = 1
                _discoro_coro.send(msg)
            else:
                break

        else:
            asyncoro.logger.warning('invalid command "%s" ignored' % _discoro_req)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(-1)

    # wait until all computations are done; process only 'close'
    while _discoro_job_coros:
        _discoro_msg = yield _discoro_coro.receive()
        if not isinstance(_discoro_msg, dict):
            continue
        _discoro_req = _discoro_msg.get('req', None)

        if _discoro_req == 'close':
            _discoro_auth = _discoro_msg.get('auth', None)
            if not _discoro_computation or _discoro_auth != _discoro_computation._auth:
                continue
            asyncoro.logger.debug('%s deleting computation "%s"' %
                                  (_discoro_coro.location, _discoro_computation._auth))

            if __name__ == '__mp_main__':  # Windows multiprocessing process
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                        sys.modules['__mp_main__'].__dict__.pop(_discoro_var, None)
                globals().update(_discoro_globals)
                sys.modules['__mp_main__'].__dict__.update(_discoro_globals)
            else:
                for _discoro_var in list(globals()):
                    if _discoro_var not in _discoro_globals:
                        globals().pop(_discoro_var, None)
                globals().update(_discoro_globals)

            break
        else:
            asyncoro.logger.warning('invalid command "%s" ignored' % _discoro_req)
            _discoro_client = _discoro_msg.get('client', None)
            if not isinstance(_discoro_client, Coro):
                continue
            _discoro_client.send(-1)

    # final cleanup of the working directory and the PID file
    for _discoro_var in os.listdir(_discoro_dest_path):
        _discoro_var = os.path.join(_discoro_dest_path, _discoro_var)
        if os.path.isdir(_discoro_var) and not os.path.islink(_discoro_var):
            shutil.rmtree(_discoro_var, ignore_errors=True)
        else:
            os.remove(_discoro_var)
    if os.path.isfile(_discoro_pid_path):
        os.remove(_discoro_pid_path)
    # report this server's auth on the configured queue before quitting
    _discoro_config['mp_queue'].put(_discoro_config['auth'])
    asyncoro.logger.debug('discoro server %s quit' % _discoro_coro.location)
class _DispyNode(object): """Internal use only. """ def __init__(self, cpus, ip_addr=None, ext_ip_addr=None, node_port=None, scheduler_node=None, scheduler_port=None, dest_path_prefix='', secret='', keyfile=None, certfile=None, max_file_size=None, zombie_interval=60): assert 0 < cpus <= multiprocessing.cpu_count() self.cpus = cpus if ip_addr: ip_addr = _node_ipaddr(ip_addr) if not ip_addr: raise Exception('invalid ip_addr') else: self.name = socket.gethostname() ip_addr = socket.gethostbyname(self.name) if ext_ip_addr: ext_ip_addr = _node_ipaddr(ext_ip_addr) if not ext_ip_addr: raise Exception('invalid ext_ip_addr') else: ext_ip_addr = ip_addr try: self.name = socket.gethostbyaddr(ext_ip_addr)[0] except: self.name = socket.gethostname() if not node_port: node_port = 51348 if not scheduler_port: scheduler_port = 51347 self.ip_addr = ip_addr self.ext_ip_addr = ext_ip_addr self.scheduler_port = scheduler_port self.pulse_interval = None self.keyfile = keyfile self.certfile = certfile if self.keyfile: self.keyfile = os.path.abspath(self.keyfile) if self.certfile: self.certfile = os.path.abspath(self.certfile) self.asyncoro = AsynCoro() self.tcp_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) if self.certfile: self.tcp_sock = ssl.wrap_socket(self.tcp_sock, keyfile=self.keyfile, certfile=self.certfile) self.tcp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) self.tcp_sock.bind((self.ip_addr, node_port)) self.address = self.tcp_sock.getsockname() self.tcp_sock.listen(30) if dest_path_prefix: self.dest_path_prefix = dest_path_prefix.strip().rstrip(os.sep) else: self.dest_path_prefix = os.path.join(os.sep, 'tmp', 'dispy') if not os.path.isdir(self.dest_path_prefix): os.makedirs(self.dest_path_prefix) os.chmod(self.dest_path_prefix, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) if max_file_size is None: max_file_size = MaxFileSize self.max_file_size = max_file_size self.avail_cpus = self.cpus self.computations = {} self.scheduler_ip_addr = None 
self.file_uses = {} self.job_infos = {} self.lock = asyncoro.Lock() self.terminate = False self.signature = os.urandom(20).encode('hex') self.auth_code = hashlib.sha1(self.signature + secret).hexdigest() self.zombie_interval = 60 * zombie_interval logger.debug('auth_code for %s: %s', ip_addr, self.auth_code) self.udp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) self.udp_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) self.udp_sock.bind(('', node_port)) logger.info('serving %s cpus at %s:%s', self.cpus, self.ip_addr, node_port) logger.debug('tcp server at %s:%s', self.address[0], self.address[1]) self.udp_sock = AsynCoroSocket(self.udp_sock, blocking=False) scheduler_ip_addr = _node_ipaddr(scheduler_node) self.reply_Q = multiprocessing.Queue() self.reply_Q_thread = threading.Thread(target=self.__reply_Q) self.reply_Q_thread.start() self.timer_coro = Coro(self.timer_task) # self.tcp_coro = Coro(self.tcp_server) self.udp_coro = Coro(self.udp_server, scheduler_ip_addr) def send_pong_msg(self, coro=None): ping_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) ping_sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) ping_sock = AsynCoroSocket(ping_sock, blocking=False) pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1], 'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version} pong_msg = 'PONG:' + serialize(pong_msg) yield ping_sock.sendto(pong_msg, ('<broadcast>', self.scheduler_port)) ping_sock.close() def udp_server(self, scheduler_ip_addr, coro=None): assert coro is not None coro.set_daemon() if self.avail_cpus == self.cpus: yield self.send_pong_msg(coro=coro) pong_msg = {'ip_addr':self.ext_ip_addr, 'name':self.name, 'port':self.address[1], 'cpus':self.cpus, 'sign':self.signature, 'version':_dispy_version} pong_msg = 'PONG:' + serialize(pong_msg) if scheduler_ip_addr: sock = AsynCoroSocket(socket.socket(socket.AF_INET, socket.SOCK_DGRAM)) try: yield sock.sendto(pong_msg, (scheduler_ip_addr, 
self.scheduler_port)) except: logger.warning("Couldn't send ping message to %s:%s", scheduler_ip_addr, self.scheduler_port) finally: sock.close() while True: msg, addr = yield self.udp_sock.recvfrom(1024) # TODO: process each message as separate Coro, so # exceptions are contained? if msg.startswith('PING:'): if self.cpus != self.avail_cpus: logger.debug('Busy (%s/%s); ignoring ping message from %s', self.cpus, self.avail_cpus, addr[0]) continue try: info = unserialize(msg[len('PING:'):]) socket.inet_aton(info['scheduler_ip_addr']) assert isinstance(info['scheduler_port'], int) assert info['version'] == _dispy_version addr = (info['scheduler_ip_addr'], info['scheduler_port']) except: # raise logger.debug('Ignoring ping message from %s (%s)', addr[0], addr[1]) continue yield self.udp_sock.sendto(pong_msg, addr) elif msg.startswith('PULSE:'): try: info = unserialize(msg[len('PULSE:'):]) assert info['ip_addr'] == self.scheduler_ip_addr yield self.lock.acquire() for compute in self.computations.itervalues(): compute.last_pulse = time.time() yield self.lock.release() except: logger.warning('Ignoring PULSE from %s', addr[0]) elif msg.startswith('SERVERPORT:'): try: req = unserialize(msg[len('SERVERPORT:'):]) sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) reply = {'ip_addr':self.address[0], 'port':self.address[1], 'sign':self.signature, 'version':_dispy_version} sock = AsynCoroSocket(sock, blocking=False) sock.settimeout(1) yield sock.sendto(serialize(reply), (req['ip_addr'], req['port'])) sock.close() except: logger.debug(traceback.format_exc()) # pass else: logger.warning('Ignoring ping message from %s', addr[0]) def tcp_serve_task(self, conn, addr, coro=None): conn = AsynCoroSocket(conn, blocking=False, keyfile=self.keyfile, certfile=self.certfile) def job_request_task(msg): assert coro is not None try: _job = unserialize(msg) except: logger.debug('Ignoring job request from %s', addr[0]) logger.debug(traceback.format_exc()) raise StopIteration yield 
self.lock.acquire() compute = self.computations.get(_job.compute_id, None) if compute is not None: if compute.scheduler_ip_addr != self.scheduler_ip_addr: compute = None yield self.lock.release() if self.avail_cpus == 0: logger.warning('All cpus busy') try: yield conn.send_msg('NAK (all cpus busy)') except: pass raise StopIteration elif compute is None: logger.warning('Invalid computation %s', _job.compute_id) try: yield conn.send_msg('NAK (invalid computation %s)' % _job.compute_id) except: pass raise StopIteration reply_addr = (compute.scheduler_ip_addr, compute.job_result_port) logger.debug('New job id %s from %s', _job.uid, addr[0]) files = [] for f in _job.files: tgt = os.path.join(compute.dest_path, os.path.basename(f['name'])) try: fd = open(tgt, 'wb') fd.write(f['data']) fd.close() except: logger.warning('Could not save file "%s"', tgt) continue try: os.utime(tgt, (f['stat'].st_atime, f['stat'].st_mtime)) os.chmod(tgt, stat.S_IMODE(f['stat'].st_mode)) except: logger.debug('Could not set modes for "%s"', tgt) files.append(tgt) _job.files = files if compute.type == _Compute.func_type: reply = _JobReply(_job, self.ext_ip_addr) job_info = _DispyJobInfo(reply, reply_addr, compute) args = (job_info, self.certfile, self.keyfile, _job.args, _job.kwargs, self.reply_Q, compute.name, compute.code, compute.dest_path, _job.files) try: yield conn.send_msg('ACK') except: logger.warning('Failed to send response for new job to %s', str(addr)) raise StopIteration job_info.job_reply.status = DispyJob.Running job_info.proc = multiprocessing.Process(target=_dispy_job_func, args=args) yield self.lock.acquire() self.avail_cpus -= 1 compute.pending_jobs += 1 self.job_infos[_job.uid] = job_info self.lock.release() job_info.proc.start() raise StopIteration elif compute.type == _Compute.prog_type: try: yield conn.send_msg('ACK') except: logger.warning('Failed to send response for new job to %s', str(addr)) raise StopIteration reply = _JobReply(_job, self.ext_ip_addr) job_info = 
_DispyJobInfo(reply, reply_addr, compute) job_info.job_reply.status = DispyJob.Running yield self.lock.acquire() self.job_infos[_job.uid] = job_info self.avail_cpus -= 1 compute.pending_jobs += 1 yield self.lock.release() prog_thread = threading.Thread(target=self.__job_program, args=(_job, job_info)) prog_thread.start() raise StopIteration else: try: yield conn.send_msg('NAK (invalid computation type "%s")' % compute.type) except: logger.warning('Failed to send response for new job to %s', str(addr)) def add_computation_task(msg): assert coro is not None try: compute = unserialize(msg) except: logger.debug('Ignoring computation request from %s', addr[0]) try: yield conn.send_msg('Invalid computation request') except: logger.warning('Failed to send reply to %s', str(addr)) raise StopIteration yield self.lock.acquire() if not ((self.scheduler_ip_addr is None) or (self.scheduler_ip_addr == compute.scheduler_ip_addr and \ self.scheduler_port == compute.scheduler_port)): logger.debug('Ignoring computation request from %s: %s, %s, %s', compute.scheduler_ip_addr, self.scheduler_ip_addr, self.avail_cpus, self.cpus) self.lock.release() try: yield conn.send_msg('Busy') except: pass raise StopIteration resp = 'ACK' if compute.dest_path and isinstance(compute.dest_path, str): compute.dest_path = compute.dest_path.strip(os.sep) else: for x in xrange(20): compute.dest_path = os.urandom(8).encode('hex') if compute.dest_path.find(os.sep) >= 0: continue if not os.path.isdir(os.path.join(self.dest_path_prefix, compute.dest_path)): break else: logger.warning('Failed to create unique dest_path: %s', compute.dest_path) resp = 'NACK' compute.dest_path = os.path.join(self.dest_path_prefix, compute.dest_path) try: os.makedirs(compute.dest_path) os.chmod(compute.dest_path, stat.S_IRUSR | stat.S_IWUSR | stat.S_IXUSR) logger.debug('dest_path for "%s": %s', compute.name, compute.dest_path) except: logger.warning('Invalid destination path: "%s"', compute.dest_path) if 
os.path.isdir(compute.dest_path): os.rmdir(compute.dest_path) self.lock.release() try: yield conn.send_msg('NACK (Invalid dest_path)') except: logger.warning('Failed to send reply to %s', str(addr)) raise StopIteration if compute.id in self.computations: logger.warning('Computation "%s" (%s) is being replaced', compute.name, compute.id) setattr(compute, 'last_pulse', time.time()) setattr(compute, 'pending_jobs', 0) setattr(compute, 'pending_results', 0) setattr(compute, 'zombie', False) logger.debug('xfer_files given: %s', ','.join(xf.name for xf in compute.xfer_files)) if compute.type == _Compute.func_type: try: code = compile(compute.code, '<string>', 'exec') except: logger.warning('Computation "%s" could not be compiled', compute.name) if os.path.isdir(compute.dest_path): os.rmdir(compute.dest_path) self.lock.release() try: yield conn.send_msg('NACK (Compilation failed)') except: logger.warning('Failed to send reply to %s', str(addr)) raise StopIteration compute.code = marshal.dumps(code) elif compute.type == _Compute.prog_type: assert not compute.code compute.name = os.path.join(compute.dest_path, os.path.basename(compute.name)) xfer_files = [] for xf in compute.xfer_files: tgt = os.path.join(compute.dest_path, os.path.basename(xf.name)) try: if _same_file(tgt, xf): logger.debug('Ignoring file "%s" / "%s"', xf.name, tgt) if tgt not in self.file_uses: self.file_uses[tgt] = 0 self.file_uses[tgt] += 1 continue except: pass if self.max_file_size and xf.stat_buf.st_size > self.max_file_size: resp = 'NACK (file "%s" too big)' % xf.name else: xfer_files.append(xf) if resp == 'ACK' and ((self.scheduler_ip_addr is not None) and \ (self.scheduler_ip_addr != compute.scheduler_ip_addr)): resp = 'NACK (busy)' if resp == 'ACK': self.computations[compute.id] = compute self.scheduler_ip_addr = compute.scheduler_ip_addr self.scheduler_port = compute.scheduler_port self.pulse_interval = compute.pulse_interval self.lock.release() if xfer_files: resp += ':XFER_FILES:' + 
serialize(xfer_files) try: yield conn.send_msg(resp) except: assert self.scheduler_ip_addr == compute.scheduler_ip_addr yield self.lock.acquire() del self.computations[compute.id] self.scheduler_ip_addr = None self.scheduler_port = None self.pulse_interval = None self.lock.release() else: self.timer_coro.resume(True) else: self.lock.release() if os.path.isdir(compute.dest_path): os.rmdir(compute.dest_path) try: yield conn.send_msg(resp) except: pass def xfer_file_task(msg): assert coro is not None try: xf = unserialize(msg) except: logger.debug('Ignoring file trasnfer request from %s', addr[0]) raise StopIteration resp = '' if xf.compute_id not in self.computations: logger.error('computation "%s" is invalid' % xf.compute_id) raise StopIteration tgt = os.path.join(self.computations[xf.compute_id].dest_path, os.path.basename(xf.name)) if os.path.isfile(tgt): if _same_file(tgt, xf): yield self.lock.acquire() if tgt in self.file_uses: self.file_uses[tgt] += 1 else: self.file_uses[tgt] = 1 yield self.lock.release() resp = 'ACK' else: logger.warning('File "%s" already exists with different status as "%s"', xf.name, tgt) if not resp: logger.debug('Copying file %s to %s (%s)', xf.name, tgt, xf.stat_buf.st_size) try: fd = open(tgt, 'wb') n = 0 while n < xf.stat_buf.st_size: data = yield conn.recvall(min(xf.stat_buf.st_size-n, 10240000)) if not data: break fd.write(data) n += len(data) if self.max_file_size and n > self.max_file_size: logger.warning('File "%s" is too big (%s); it is truncated', tgt, n) break fd.close() if n < xf.stat_buf.st_size: resp = 'NAK (read only %s bytes)' % n else: resp = 'ACK' logger.debug('Copied file %s, %s', tgt, resp) os.utime(tgt, (xf.stat_buf.st_atime, xf.stat_buf.st_mtime)) os.chmod(tgt, stat.S_IMODE(xf.stat_buf.st_mode)) self.file_uses[tgt] = 1 except: logger.warning('Copying file "%s" failed with "%s"', xf.name, traceback.format_exc()) resp = 'NACK' try: yield conn.send_msg(resp) except: logger.debug('Could not send reply for "%s"', 
xf.name) raise StopIteration # xfer_file_task def terminate_job_task(msg): assert coro is not None yield self.lock.acquire() try: _job = unserialize(msg) compute = self.computations[_job.compute_id] assert addr[0] == compute.scheduler_ip_addr job_info = self.job_infos.pop(_job.uid, None) except: logger.debug('Ignoring job request from %s', addr[0]) raise StopIteration finally: self.lock.release() if job_info is None: logger.debug('Job %s completed; ignoring cancel request from %s', _job.uid, addr[0]) raise StopIteration logger.debug('Terminating job %s', _job.uid) job_info.proc.terminate() if isinstance(job_info.proc, multiprocessing.Process): for x in xrange(20): if job_info.proc.is_alive(): yield coro.sleep(0.1) else: logger.debug('Process "%s" for job %s terminated', compute.name, _job.uid) break else: logger.warning('Could not kill process %s', compute.name) raise StopIteration else: assert isinstance(job_info.proc, subprocess.Popen) for x in xrange(20): rc = job_info.proc.poll() logger.debug('Program "%s" for job %s terminated with %s', compute.name, _job.uid, rc) if rc is not None: break if x == 10: logger.debug('Killing job %s', _job.uid) job_info.proc.kill() yield coro.sleep(0.1) else: logger.warning('Could not kill process %s', compute.name) raise StopIteration reply_addr = (addr[0], compute.job_result_port) reply = _JobReply(_job, self.ext_ip_addr) job_info = _DispyJobInfo(reply, reply_addr, compute) reply.status = DispyJob.Terminated yield self._send_job_reply(job_info, resending=False, coro=coro) def retrieve_job_task(msg): assert coro is not None try: req = unserialize(msg) assert req['uid'] is not None assert req['hash'] is not None assert req['compute_id'] is not None except: resp = serialize('Invalid job') try: yield conn.send_msg(resp) except: pass raise StopIteration job_info = self.job_infos.get(req['uid'], None) resp = None if job_info is not None: try: yield conn.send_msg(serialize(job_info.job_reply)) ack = yield conn.recv_msg() # no need to 
check ack except: logger.debug('Could not send reply for job %s', req['uid']) raise StopIteration for d in os.listdir(self.dest_path_prefix): info_file = os.path.join(self.dest_path_prefix, d, '_dispy_job_reply_%s' % req['uid']) if os.path.isfile(info_file): try: fd = open(info_file, 'rb') job_reply = pickle.load(fd) fd.close() except: job_reply = None if hasattr(job_reply, 'hash') and job_reply.hash == req['hash']: try: yield conn.send_msg(serialize(job_reply)) ack = yield conn.recv_msg() assert ack == 'ACK' except: logger.debug('Could not send reply for job %s', req['uid']) raise StopIteration try: os.remove(info_file) yield self.lock.acquire() compute = self.computations.get(req['compute_id'], None) if compute is not None: compute.pending_results -= 1 if compute.pending_results == 0: compute.zombie = True self.cleanup_computation(compute) self.lock.release() except: logger.debug('Could not remove "%s"', info_file) raise StopIteration else: resp = serialize('Invalid job: %s' % req['uid']) if resp: try: yield conn.send_msg(resp) except: pass # tcp_serve_task starts try: req = yield conn.recvall(len(self.auth_code)) assert req == self.auth_code except: logger.warning('Ignoring request; invalid client authentication?') conn.close() raise StopIteration msg = yield conn.recv_msg() if not msg: conn.close() raise StopIteration if msg.startswith('JOB:'): msg = msg[len('JOB:'):] yield job_request_task(msg) conn.close() elif msg.startswith('COMPUTE:'): msg = msg[len('COMPUTE:'):] yield add_computation_task(msg) conn.close() elif msg.startswith('FILEXFER:'): msg = msg[len('FILEXFER:'):] yield xfer_file_task(msg) conn.close() elif msg.startswith('DEL_COMPUTE:'): msg = msg[len('DEL_COMPUTE:'):] try: info = unserialize(msg) compute_id = info['ID'] yield self.lock.acquire() compute = self.computations.get(compute_id, None) if compute is None: logger.warning('Computation "%s" is not valid', compute_id) else: compute.zombie = True self.cleanup_computation(compute) 
self.lock.release() except: logger.debug('Deleting computation failed with %s', traceback.format_exc()) # raise conn.close() elif msg.startswith('TERMINATE_JOB:'): msg = msg[len('TERMINATE_JOB:'):] yield terminate_job_task(msg) conn.close() elif msg.startswith('RETRIEVE_JOB:'): msg = msg[len('RETRIEVE_JOB:'):] yield retrieve_job_task(msg) conn.close() else: logger.warning('Invalid request "%s" from %s', msg[:min(10, len(msg))], addr[0]) resp = 'NAK (invalid command: %s)' % (msg[:min(10, len(msg))]) try: yield conn.send_msg(resp) except: logger.warning('Failed to send reply to %s', str(addr)) conn.close() def timer_task(self, coro=None): coro.set_daemon() reset = True last_pulse_time = last_zombie_time = time.time() while True: if reset: if self.pulse_interval and self.zombie_interval: timeout = min(self.pulse_interval, self.zombie_interval) self.zombie_interval = max(5 * self.pulse_interval, self.zombie_interval) else: timeout = max(self.pulse_interval, self.zombie_interval) self.zombie_interval = self.zombie_interval reset = yield coro.suspend(timeout) now = time.time() if self.pulse_interval and (now - last_pulse_time) >= self.pulse_interval: n = self.cpus - self.avail_cpus assert n >= 0 if n > 0 and self.scheduler_ip_addr: last_pulse_time = now msg = 'PULSE:' + serialize({'ip_addr':self.ext_ip_addr, 'port':self.udp_sock.getsockname()[1], 'cpus':n}) sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock = AsynCoroSocket(sock, blocking=False) sock.settimeout(1) yield sock.sendto(msg, (self.scheduler_ip_addr, self.scheduler_port)) sock.close() if self.zombie_interval and (now - last_zombie_time) >= self.zombie_interval: last_zombie_time = now yield self.lock.acquire() for compute in self.computations.itervalues(): if (now - compute.last_pulse) > self.zombie_interval: compute.zombie = True zombies = [compute for compute in self.computations.itervalues() \ if compute.zombie and compute.pending_jobs == 0] for compute in zombies: logger.debug('Deleting zombie 
computation "%s"', compute.name) self.cleanup_computation(compute) phoenix = [compute for compute in self.computations.itervalues() \ if not compute.zombie and compute.pending_results] for compute in phoenix: files = [f for f in os.listdir(compute.dest_path) \ if f.startswith('_dispy_job_reply_')] # limit number queued so as not to take up too much time files = files[:min(len(files), 128)] for f in files: result_file = os.path.join(compute.dest_path, f) try: fd = open(result_file, 'rb') job_result = pickle.load(fd) fd.close() except: logger.debug('Could not load "%s"', result_file) logger.debug(traceback.format_exc()) continue try: os.remove(result_file) except: logger.debug('Could not remove "%s"', result_file) compute.pending_results -= 1 job_info = _DispyJobInfo(job_result, (compute.scheduler_ip_addr, compute.job_result_port), compute) Coro(self._send_job_reply, job_info, resending=True) self.lock.release() for compute in zombies: sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock = AsynCoroSocket(sock, blocking=False) sock.settimeout(1) logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr) data = serialize({'ip_addr':self.address[0], 'port':self.address[1], 'sign':self.signature}) yield sock.sendto('TERMINATED:%s' % data, (compute.scheduler_ip_addr, compute.scheduler_port)) sock.close() if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus: self.pulse_interval = None reset = True yield self.send_pong_msg(coro=coro) def __job_program(self, _job, job_info): compute = self.computations[_job.compute_id] program = [compute.name] args = unserialize(_job.args) program.extend(args) logger.debug('Executing "%s"', str(program)) reply = job_info.job_reply try: os.chdir(compute.dest_path) env = {} env.update(os.environ) env['PATH'] = compute.dest_path + ':' + env['PATH'] job_info.proc = subprocess.Popen(program, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) assert isinstance(job_info.proc, subprocess.Popen) reply.stdout, 
reply.stderr = job_info.proc.communicate() reply.result = job_info.proc.returncode reply.status = DispyJob.Finished except: logger.debug('Executing %s failed with %s', str(program), str(sys.exc_info())) reply.exception = traceback.format_exc() reply.status = DispyJob.Terminated self.reply_Q.put(reply) def __reply_Q(self): while True: job_reply = self.reply_Q.get() if job_reply is None: break job_info = self.job_infos.pop(job_reply.uid, None) if job_info is not None: if job_info.proc is not None: if isinstance(job_info.proc, multiprocessing.Process): job_info.proc.join(2) else: job_info.proc.wait() job_info.job_reply = job_reply Coro(self._send_job_reply, job_info, resending=False).value() def _send_job_reply(self, job_info, resending=False, coro=None): """Internal use only. """ assert coro is not None job_reply = job_info.job_reply logger.debug('Sending result for job %s (%s) to %s', job_reply.uid, job_reply.status, str(job_info.reply_addr)) sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) sock = AsynCoroSocket(sock, blocking=False, certfile=self.certfile, keyfile=self.keyfile) sock.settimeout(2) try: yield sock.connect(job_info.reply_addr) yield sock.send_msg(serialize(job_reply)) ack = yield sock.recv_msg() assert ack == 'ACK' except: logger.error("Couldn't send results for %s to %s", job_reply.uid, str(job_info.reply_addr)) # store job result even if computation has not enabled # fault recovery; user may be able to access node and # retrieve result manually f = os.path.join(job_info.compute_dest_path, '_dispy_job_reply_%s' % job_reply.uid) logger.debug('storing results for job %s', job_reply.uid) try: fd = open(f, 'wb') pickle.dump(job_reply, fd) fd.close() except: logger.debug('Could not save results for job %s', job_reply.uid) else: yield self.lock.acquire() compute = self.computations.get(job_info.compute_id, None) if compute is not None: compute.pending_results += 1 self.lock.release() finally: sock.close() if not resending: yield self.lock.acquire() 
self.avail_cpus += 1 compute = self.computations.get(job_info.compute_id, None) if compute is None: logger.warning('Computation for %s / %s is invalid!', job_reply.uid, job_info.compute_id) else: # technically last_pulse should be updated only # when successfully sent reply, but no harm if done # otherwise, too compute.last_pulse = time.time() compute.pending_jobs -= 1 if compute.pending_jobs == 0 and compute.zombie: self.cleanup_computation(compute) self.lock.release() def cleanup_computation(self, compute): # called with lock held if not compute.zombie: return if compute.pending_jobs != 0: logger.debug('pending jobs for computation "%s"/%s: %s', compute.name, compute.id, compute.pending_jobs) if compute.pending_jobs > 0: return del self.computations[compute.id] if compute.scheduler_ip_addr == self.scheduler_ip_addr and \ all(c.scheduler_ip_addr != self.scheduler_ip_addr \ for c in self.computations.itervalues()): assert self.avail_cpus == self.cpus self.scheduler_ip_addr = None self.pulse_interval = None if self.scheduler_ip_addr is None and self.avail_cpus == self.cpus: self.timer_coro.resume(True) Coro(self.send_pong_msg) if compute.cleanup is False: return for xf in compute.xfer_files: tgt = os.path.join(compute.dest_path, os.path.basename(xf.name)) if tgt not in self.file_uses: logger.debug('File "%s" is unknown', tgt) continue self.file_uses[tgt] -= 1 if self.file_uses[tgt] == 0: del self.file_uses[tgt] if tgt == xf: logger.debug('Not removing file "%s"', xf.name) else: logger.debug('Removing file "%s"', tgt) try: os.remove(tgt) if os.path.splitext(tgt)[1] == '.py' and os.path.isfile(tgt + 'c'): os.remove(tgt + 'c') except: logger.warning('Could not remove file "%s"', tgt) if os.path.isdir(compute.dest_path) and \ compute.dest_path.startswith(self.dest_path_prefix) and \ len(compute.dest_path) > len(self.dest_path_prefix) and \ len(os.listdir(compute.dest_path)) == 0: logger.debug('Removing "%s"', compute.dest_path) try: os.rmdir(compute.dest_path) except: 
logger.warning('Could not remove directory "%s"', compute.dest_path) def shutdown(self): def _shutdown(self, coro=None): assert coro is not None yield self.lock.acquire() job_infos = self.job_infos self.job_infos = {} computations = self.computations.items() self.computations = {} if self.reply_Q: self.reply_Q.put(None) self.lock.release() for uid, job_info in job_infos.iteritems(): job_info.proc.terminate() logger.debug('process for %s is killed', uid) if isinstance(job_info.proc, multiprocessing.Process): job_info.proc.join(2) else: job_info.proc.wait() for cid, compute in computations: sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock = AsynCoroSocket(sock, blocking=False) sock.settimeout(2) logger.debug('Sending TERMINATE to %s', compute.scheduler_ip_addr) data = serialize({'ip_addr':self.address[0], 'port':self.address[1], 'sign':self.signature}) yield sock.sendto('TERMINATED:' + data, (compute.scheduler_ip_addr, compute.scheduler_port)) sock.close() Coro(_shutdown, self).value() self.asyncoro.join() self.asyncoro.terminate()