class OBCIServer(OBCIControlPeer):
    """Central OBCI control server.

    Tracks running experiments and process supervisors, answers client
    requests (create/list/kill experiment, contact info), and broadcasts
    its presence on the LAN so nearby servers can discover each other.
    """

    # Per-class copy so handlers registered below don't leak into the base class.
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    @log_crash
    def __init__(self, rep_addresses=None, pub_addresses=None, name='obci_server'):
        # uuid -> ExperimentInfo of registered experiments.
        self.experiments = {}
        # sender-id -> process handle of launched obci_process_supervisors.
        self.exp_process_supervisors = {}
        self._nearby_servers = net.DNS()
        super(OBCIServer, self).__init__(None, rep_addresses, pub_addresses, name)
        self.machine = socket.gethostname()
        self.rep_port = int(net.server_rep_port())
        self.pub_port = int(net.server_pub_port())
        bcast_port = int(net.server_bcast_port())
        self._nearby_servers.logger = self.logger
        # Daemon thread announcing this server's ports via UDP broadcast.
        self._bcast_server = threading.Thread(target=broadcast_server,
                                              args=[self.uuid, self.rep_port,
                                                    self.pub_port, bcast_port])
        self._bcast_server.daemon = True
        self._bcast_server.start()
        # Daemon thread listening for other servers' broadcasts and
        # updating the shared DNS snapshot.
        self._nearby_updater = threading.Thread(target=update_nearby_servers,
                                                args=[self._nearby_servers,
                                                      bcast_port,
                                                      self.ctx,
                                                      self._push_addr])
        self._nearby_updater.daemon = True
        self._nearby_updater.start()
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger)

    def nearby_server_addrs(self):
        """Return IPs of currently known nearby servers."""
        snap = self._nearby_servers.snapshot()
        return [srv.ip for srv in snap.values()]

    def nearby_servers(self):
        """Return a point-in-time snapshot of the nearby-server registry."""
        return self._nearby_servers.snapshot()

    def my_ip(self):
        """Return this machine's network IP, falling back to 127.0.1.1 on error."""
        addr = "127.0.1.1"
        try:
            addr = self._nearby_servers.this_addr_network()
        except Exception as e:
            self.logger.error(str(e))
        return addr

    def network_ready(self):
        # i know my network IP
        # (my_ip() still returning the hostname means discovery hasn't
        # resolved a real address yet)
        return self.my_ip() != self.machine

    def handle_socket_read_error(self, socket, error):
        """Recreate whichever REP socket failed; drop a pending client request
        if it was waiting on the broken socket.

        NOTE: the ``socket`` parameter shadows the ``socket`` module inside
        this method.
        """
        if socket == self.rep_socket:
            self.logger.warning("reinitialising REP socket")
            self._all_sockets.remove(self.rep_socket)
            if socket in self.client_rq:
                self.client_rq = None
            self.rep_socket.close()  # linger=0)
            self.rep_socket = None
            # brief pause so the port can be rebound
            time.sleep(0.2)
            (self.rep_socket, self.rep_addresses) = self._init_socket(
                ['tcp://*:' + str(self.rep_port)], zmq.REP)
            self.rep_socket.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.rep_socket)
            self.logger.info(self.rep_addresses)
        elif socket == self.exp_rep:
            self.logger.info("reinitialising EXPERIMENT REP socket")
            self.exp_rep.close()  # linger=0)
            (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
                self.exp_rep_addrs, zmq.REP)
            self.exp_rep.setsockopt(zmq.LINGER, 0)
            self._all_sockets.append(self.exp_rep)

    def peer_type(self):
        return 'obci_server'

    def net_init(self):
        """Create the experiment-facing REP socket and the TCP proxy,
        then let the base class finish socket setup."""
        (self.exp_rep, self.exp_rep_addrs) = self._init_socket(
            [], zmq.REP)
        # (self.exp_pub, self.exp_pub_addrs) = self._init_socket(
        #     [], zmq.PUB)
        # self.exp_pub.setsockopt(zmq.LINGER, 0)
        self._all_sockets.append(self.exp_rep)
        # self._all_sockets.append(self.exp_pub)
        tcp_port = int(net.server_tcp_proxy_port())
        self._tcp_proxy_thr, tcp_port = twisted_tcp_handling.run_twisted_server(
            ('0.0.0.0', tcp_port), self.ctx, self.rep_addresses[0])
        self.tcp_addresses = [(self.my_ip(), tcp_port),
                              (socket.gethostname(), tcp_port)]
        super(OBCIServer, self).net_init()

    def custom_sockets(self):
        return [self.exp_rep]  # , self.srv_rep, self.srv_pub]

    def clean_up(self):
        # self._tcp_srv.shutdown()
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        """Broadcast kill/shutdown notices and kill owned supervisors."""
        send_msg(self._publish_socket,  # self.exp_pub,
                 self.mtool.fill_msg("kill", receiver=""))
        send_msg(self._publish_socket,
                 self.mtool.fill_msg("launcher_shutdown",
                                     sender=self.uuid))
        for sup in self.exp_process_supervisors:
            self.exp_process_supervisors[sup].kill()
        self.logger.info('sent KILL to experiments')

    def _args_for_experiment(self, sandbox_dir, launch_file, local=False, name=None, overwrites=None):
        """Build the command-line argument list for a new experiment process.

        ``local`` is currently unused (the commented-out branch below used it).
        """
        args = ['--sv-addresses']
        args += self.exp_rep_addrs
        args.append('--sv-pub-addresses')
        # if local:
        #     addrs = net.choose_local(self.exp_pub_addrs)
        # else:
        #     addrs = net.choose_not_local(self.exp_pub_addrs)
        addrs = net.choose_local(self.pub_addresses)  # self.exp_pub_addrs
        args += addrs
        exp_name = name if name else os.path.basename(launch_file)
        args += [
            '--sandbox-dir', str(sandbox_dir),
            '--launch-file', str(launch_file),
            '--name', exp_name,
            '--current-ip', self.my_ip()]
        if overwrites is not None:
            args += peer_cmd.peer_overwrites_cmd(overwrites)
        # print '{0} [{1}] -- experiment args: {2}'.format(self.name, self.peer_type(), args)
        return args

    def start_experiment_process(self, sandbox_dir, launch_file, name=None, overwrites=None):
        """Spawn a local obci_experiment process; returns (process, details)."""
        path = 'obci_experiment'
        args = self._args_for_experiment(sandbox_dir, launch_file,
                                         local=True, name=name, overwrites=overwrites)
        return self.subprocess_mgr.new_local_process(path, args,
                                                     proc_type='obci_experiment',
                                                     capture_io=NO_STDIO)

    def handle_register_experiment(self, message, sock):
        """Record a newly started experiment, reply to the pending client
        request (if any) and publish an ``experiment_created`` notice."""
        machine, pid = message.other_params['origin_machine'], message.other_params['pid']
        status, det = message.other_params['status_name'], message.other_params['details']
        launch_file = message.other_params['launch_file_path']
        tcp_addr = message.other_params['tcp_addrs']
        exp_proc = self.subprocess_mgr.process(machine, pid)
        if exp_proc is None:
            send_msg(sock, self.mtool.fill_msg("rq_error", err_code="experiment_not_found"))
            return
        info = self.experiments[message.uuid] = ExperimentInfo(
            message.uuid, message.name,
            message.rep_addrs, message.pub_addrs,
            time.time(), machine, pid,
            status, det, launch_file, tcp_addr,
            self._nearby_servers.this_addr_network())
        exp_proc.registered(info)
        # NOTE(review): this loop rebinds the local ``addrs`` only; it does
        # not appear to mutate info.rep_addrs/info.pub_addrs — confirm intent.
        for addrs in [info.rep_addrs, info.pub_addrs]:
            one = addrs[0]
            port = net.port(one)
            addrs = [self._nearby_servers.this_addr_network() + ':' + str(port)] + addrs
        info_msg = self.mtool.fill_msg("experiment_created",
                                       uuid=info.uuid,
                                       name=info.name,
                                       rep_addrs=info.rep_addrs,
                                       pub_addrs=info.pub_addrs,
                                       origin_machine=info.origin_machine,
                                       status_name=status,
                                       details=det,
                                       launch_file_path=launch_file,
                                       tcp_addrs=tcp_addr)
        if self.client_rq:
            msg_type = self.client_rq[0].type
            rq_sock = self.client_rq[1]
            if msg_type == "create_experiment":
                self.client_rq = None
                send_msg(rq_sock, info_msg)
        send_msg(sock, self.mtool.fill_msg("rq_ok",
                                           params=self._nearby_servers.dict_snapshot()))
        send_msg(self._publish_socket, info_msg)

    def _handle_register_experiment_timeout(self, exp):
        """Kill an experiment that never registered and report the failure
        back to the client that requested it."""
        self.logger.error("New experiment process failed to "
                          "register before timeout" + str(exp.pid))
        if exp.returncode is None:
            exp.kill()
            exp.wait()
        # msg_type = self.client_rq[0].type
        rq_sock = self.client_rq[1]
        send_msg(rq_sock, self.mtool.fill_msg("rq_error",
                                              err_code="create_experiment_error",
                                              request=vars(self.client_rq[0])))

    @msg_handlers.handler("register_peer")
    def handle_register_peer(self, message, sock):
        """Register peer"""
        if message.peer_type == "obci_client":
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
        elif message.peer_type == "obci_experiment":
            self.handle_register_experiment(message, sock)
        else:
            super(OBCIServer, self).handle_register_peer(message, sock)

    @msg_handlers.handler("create_experiment")
    def handle_create_experiment(self, message, sock):
        """Launch an experiment process for a client request; the final reply
        is deferred until the experiment registers (via ``client_rq``)."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        launch_file = message.launch_file
        sandbox = message.sandbox_dir
        name = message.name
        overwrites = message.overwrites
        sandbox = sandbox if sandbox else settings.DEFAULT_SANDBOX_DIR
        exp, details = self.start_experiment_process(
            sandbox, launch_file, name, overwrites)
        if exp is None:
            self.logger.error("failed to launch experiment "
                              "process, request: " + str(vars(message)))
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               request=vars(message),
                                               err_code='launch_error',
                                               details=details))
        else:
            self.logger.info("experiment process "
                             "launched: {0}".format(exp.pid))
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                # remember the requester so registration can answer it
                self.client_rq = (message, sock)

    @msg_handlers.handler("list_experiments")
    def handle_list_experiments(self, message, sock):
        """Reply with info on all experiments plus known nearby machines."""
        exp_data = {}
        for exp_id in self.experiments:
            exp_data[exp_id] = self.experiments[exp_id].info()
        nearby = self.nearby_servers()
        nearby_dict = {}
        for srv in nearby.values():
            nearby_dict[srv.ip] = srv.hostname
        info = '\n{'
        for srv in nearby_dict:
            info += '\n' + srv + ' : ' + nearby_dict[srv] + ','
        info += '}'
        self.logger.debug("nearby servers: count: {0}, {1}".format(
            len(nearby), info))
        send_msg(sock, self.mtool.fill_msg("running_experiments",
                                           exp_data=exp_data,
                                           nearby_machines=nearby_dict))

    @msg_handlers.handler("list_nearby_machines")
    def handle_list_nearby_machines(self, message, sock):
        send_msg(sock, self.mtool.fill_msg('nearby_machines',
                                           nearby_machines=self._nearby_servers.dict_snapshot()))

    def _handle_match_name(self, message, sock, this_machine=False):
        """Resolve ``message.strname`` to exactly one experiment.

        Sends an rq_error on no match / ambiguity / wrong machine and
        returns None; otherwise returns the matching ExperimentInfo.
        """
        matches = self.exp_matching(message.strname)
        match = None
        msg = None
        if not matches:
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='experiment_not_found')
        elif len(matches) > 1:
            matches = [(exp.uuid, exp.name) for exp in matches]
            msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                      err_code='ambiguous_exp_name',
                                      details=matches)
        else:
            match = matches.pop()
            if this_machine and match.origin_machine != self.machine:
                msg = self.mtool.fill_msg("rq_error", request=vars(message),
                                          err_code='exp_not_on_this_machine',
                                          details=match.origin_machine)
                match = None
        if msg and sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, msg)
        return match

    @msg_handlers.handler("get_experiment_contact")
    def handle_get_experiment_contact(self, message, sock):
        self.logger.debug("##### rq contact for: %s", message.strname)
        info = self._handle_match_name(message, sock)
        if info:
            send_msg(sock, self.mtool.fill_msg("experiment_contact",
                                               uuid=info.uuid,
                                               name=info.name,
                                               rep_addrs=info.rep_addrs,
                                               pub_addrs=info.pub_addrs,
                                               tcp_addrs=info.tcp_addrs,
                                               machine=info.origin_machine,
                                               status_name=info.status_name,
                                               details=info.details))

    @msg_handlers.handler("experiment_status_change")
    def handle_experiment_status_change(self, message, sock):
        """Update stored status/details and re-publish the change."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error',
                                                   err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_info_change")
    def handle_experiment_info_change(self, message, sock):
        """Update stored name/launch-file and re-publish the change."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            self.logger.warning("UUID not found " + message.uuid)
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error',
                                                   err_code='experiment_not_found'))
            return
        exp.name = message.name
        exp.launch_file_path = message.launch_file_path
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        self.logger.info("INFO CHANGED %s", exp.launch_file_path)
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("experiment_transformation")
    def handle_experiment_transformation(self, message, sock):
        """Apply a full status+name+launch-file change and re-publish it."""
        exp = self.experiments.get(message.uuid, None)
        if not exp:
            if sock.socket_type in [zmq.REP, zmq.ROUTER]:
                send_msg(sock, self.mtool.fill_msg('rq_error',
                                                   err_code='experiment_not_found'))
            return
        exp.status_name = message.status_name
        exp.details = message.details
        exp.launch_file_path = message.launch_file
        exp.name = message.name
        if sock.socket_type in [zmq.REP, zmq.ROUTER]:
            send_msg(sock, self.mtool.fill_msg('rq_ok'))
        send_msg(self._publish_socket, message.SerializeToString())

    def exp_matching(self, strname):
        """Match *strname* against all created experiment IDs and names.
        Return those experiment descriptions which name or uuid starts
        with strname.
        """
        match_names = {}
        for uid, exp in self.experiments.items():
            if exp.name.startswith(strname):
                match_names[uid] = exp
        ids = self.experiments.keys()
        match_ids = [uid for uid in ids if uid.startswith(strname)]
        experiments = set()
        for uid in match_ids:
            experiments.add(self.experiments[uid])
        for name, exp in match_names.items():
            experiments.add(exp)
        return experiments

    @msg_handlers.handler("kill_experiment")
    def handle_kill_experiment(self, message, sock):
        """Send a kill to one experiment (on this machine only) and arm a
        timer that force-kills the process if it doesn't exit."""
        match = self._handle_match_name(message, sock, this_machine=True)
        if match:
            if match.kill_timer is not None:
                send_msg(sock, self.mtool.fill_msg("rq_error",
                                                   err_code="already_killed",
                                                   details="Experiment already shutting down"))
            elif not message.force:
                self.logger.info("sending kill to experiment "
                                 "{0} ({1})".format(match.uuid, match.name))
                send_msg(self._publish_socket,  # self.exp_pub,
                         self.mtool.fill_msg("kill", receiver=match.uuid))
                send_msg(sock, self.mtool.fill_msg("kill_sent",
                                                   experiment_id=match.uuid))
                pid = match.experiment_pid
                uid = match.uuid
                self.logger.info("Waiting for experiment process {0} to terminate".format(uid))
                # give the experiment 1.1 s to shut down cleanly
                match.kill_timer = threading.Timer(1.1, self._handle_killing_exp,
                                                   args=[pid, uid])
                match.kill_timer.start()
                send_msg(self._publish_socket,
                         self.mtool.fill_msg('kill_sent',
                                             experiment_id=match.uuid
                                             ))

    def _handle_killing_exp(self, pid, uid):
        """Timer callback: force-kill the experiment process if still alive
        and drop it from the registry. Returns the process return code."""
        proc = self.subprocess_mgr.process(self.machine, pid)
        if proc.process_is_running():
            proc.kill()
        self.logger.info("experiment {0} FINISHED".format(uid))
        proc.delete = True
        del self.experiments[uid]
        return proc.popen_obj.returncode

    @msg_handlers.handler("launch_process")
    def handle_launch_process(self, message, sock):
        # only process supervisors are launchable this way
        if message.proc_type == 'obci_process_supervisor':
            self._handle_launch_process_supervisor(message, sock)

    def _handle_launch_process_supervisor(self, message, sock):
        """Start a process supervisor and reply with launch info or an error."""
        sv_obj, details = self._start_obci_supervisor_process(message)
        self.logger.info("LAUNCH PROCESS SV " + str(sv_obj) + str(details))
        if sv_obj:
            self.exp_process_supervisors[message.sender] = sv_obj
            send_msg(sock, self.mtool.fill_msg("launched_process_info",
                                               sender=self.uuid,
                                               machine=self.machine,
                                               pid=sv_obj.pid,
                                               proc_type=sv_obj.proc_type,
                                               name=sv_obj.name,
                                               path=sv_obj.path))
            self.logger.info("CONFIRMED LAUNCH")
        else:
            send_msg(sock, self.mtool.fill_msg('rq_error',
                                               request=message.dict(),
                                               err_code="launch_error",
                                               details=details))
            self.logger.error("PROCESS SUPERVISOR LAUNCH FAILURE")

    @msg_handlers.handler("kill_process")
    def handle_kill_process_supervisor(self, message, sock):
        """Kill a supervisor process by (machine, pid) and unregister it."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if not proc:
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code="process_not_found"))
        else:
            # TODO
            # name = proc.name
            proc.kill()
            proc.mark_delete()
            send_msg(sock, self.mtool.fill_msg("rq_ok"))
            # NOTE(review): assumes proc.name is always a key of
            # exp_process_supervisors — a miss would raise KeyError; confirm.
            del self.exp_process_supervisors[proc.name]

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Log and mark-for-deletion a process reported dead by the monitor."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            status, details = proc.status()
            self.logger.warning("Process " + proc.proc_type + " dead: " +
                                status + str(details) + proc.name + str(proc.pid))
            if proc.proc_type == 'obci_process_supervisor':
                pass
            elif proc.proc_type == 'obci_experiment':
                pass
            if status == subprocess_monitor.FAILED:
                pass

    @msg_handlers.handler("find_eeg_experiments")
    def handle_find_eeg_experiments(self, message, sock):
        """Ack immediately, then search for EEG experiments in a daemon
        thread that pushes results back to the requester."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        finder_thr = threading.Thread(target=find_eeg_experiments_and_push_results,
                                      args=[self.ctx,
                                            self.rep_addresses,
                                            message,
                                            self._nearby_servers.copy()])
        finder_thr.daemon = True
        finder_thr.start()

    @msg_handlers.handler("find_eeg_amplifiers")
    def handle_find_new_eeg_amplifiers(self, message, sock):
        """Ack immediately, then look for amplifiers in a daemon thread."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        amp_thr = threading.Thread(target=find_new_experiments_and_push_results,
                                   args=[self.ctx, message])
        amp_thr.daemon = True
        amp_thr.start()

    @msg_handlers.handler("start_eeg_signal")
    def handle_start_eeg_signal(self, message, sock):
        """Ack immediately, then start an EEG signal experiment in a
        daemon thread."""
        if not self.network_ready() and self._nearby_servers.dict_snapshot():
            send_msg(sock, self.mtool.fill_msg("rq_error",
                                               err_code='server_network_not_ready'))
            return
        send_msg(sock, self.mtool.fill_msg("rq_ok"))
        start_thr = threading.Thread(target=start_eeg_signal_experiment,
                                     args=[self.ctx, self.rep_addresses, message])
        start_thr.daemon = True
        start_thr.start()

    def _start_obci_supervisor_process(self, rq_message):
        """Spawn obci_process_supervisor locally, forwarding the request's
        parameters (minus routing fields) to the subprocess manager.

        Returns (process, False) on success or (None, details) on failure.
        """
        path = obci_process_supervisor.__file__
        # run the .py source even if __file__ points at a .pyc
        path = '.'.join([path.rsplit('.', 1)[0], 'py'])
        start_params = rq_message.dict()
        start_params['path'] = path
        # strip message-envelope fields that new_local_process doesn't accept
        del start_params['type']
        del start_params['sender']
        del start_params['sender_ip']
        del start_params['receiver']
        sv_obj, details = self.subprocess_mgr.new_local_process(**start_params)
        if sv_obj is None:
            return None, details
        return sv_obj, False

    def _crash_extra_data(self, exception=None):
        """Attach the experiment registry to crash reports."""
        data = super(OBCIServer, self)._crash_extra_data(exception)
        data.update({
            'experiments': [e.info() for e in self.experiments.values()]
        })
        return data
class OBCIProcessSupervisor(OBCIControlPeer):
    """Per-machine supervisor that launches and monitors the processes of
    one experiment: the multiplexer, the config server and the peers,
    in the order dictated by the experiment's launch data.
    """

    # Per-class copy so handlers registered below don't leak into the base class.
    msg_handlers = OBCIControlPeer.msg_handlers.copy()

    def __init__(
        self,
        sandbox_dir,
        source_addresses=None,
        source_pub_addresses=None,
        rep_addresses=None,
        pub_addresses=None,
        experiment_uuid="",
        name="obci_process_supervisor",
    ):
        self.peers = {}
        self.status = launcher_tools.READY_TO_LAUNCH
        self.source_pub_addresses = source_pub_addresses
        self.machine = socket.gethostname()
        self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR
        self.ctx = zmq.Context()
        # ((addr, port), passwd) if the multiplexer runs here, else (None, None)
        self.mx_data = self.set_mx_data()
        # environment passed to child peers (includes MULTIPLEXER_* vars)
        self.env = self.peer_env(self.mx_data)
        self.launch_data = []
        self.peer_order = []
        self._running_peer_order = []
        self._current_part = None
        self.experiment_uuid = experiment_uuid
        self.peers_to_launch = []
        # peer-id -> process handle of launched children
        self.processes = {}
        # peer ids currently being restarted (suppresses death notices)
        self.restarting = []
        super(OBCIProcessSupervisor, self).__init__(
            source_addresses=source_addresses,
            rep_addresses=rep_addresses,
            pub_addresses=pub_addresses,
            name=name
        )
        self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid)

    def peer_type(self):
        return "obci_process_supervisor"

    def net_init(self):
        """Create the SUB socket to the experiment and the PULL socket the
        config server reports to, then finish base-class setup."""
        self.source_sub_socket = self.ctx.socket(zmq.SUB)
        self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "")
        self._all_sockets.append(self.source_sub_socket)
        if self.source_pub_addresses:
            for addr in self.source_pub_addresses:
                self.source_sub_socket.connect(addr)
        (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL)
        # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "")
        # prefer a non-local address; fall back to a local one
        self.cs_addr = net.choose_not_local(self.cs_addresses)
        if not self.cs_addr:
            self.cs_addr = net.choose_local(self.cs_addresses)[0]
        else:
            self.cs_addr = self.cs_addr[0]
        self._all_sockets.append(self.config_server_socket)
        super(OBCIProcessSupervisor, self).net_init()

    def params_for_registration(self):
        """Data sent to the experiment when this supervisor registers."""
        return dict(
            pid=os.getpid(),
            machine=self.machine,
            mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])),
                     self.mx_data[1]],
        )

    def custom_sockets(self):
        return [self.source_sub_socket, self.config_server_socket]

    def _handle_registration_response(self, response):
        """Store the launch data and peer start order sent back by the
        experiment."""
        self.launch_data = response.params["launch_data"]
        self.peers_to_launch = list(self.launch_data.keys())
        self.peer_order = response.params["peer_order"]
        for part in self.peer_order:
            self._running_peer_order.append(list(part))
        print self.name, "[", self.type, "]", "RECEIVED LAUNCH DATA: ", self.launch_data

    def set_mx_data(self):
        """Decide whether the multiplexer runs on this machine.

        Returns (('0.0.0.0', port), '') with a freshly probed free port when
        the experiment's source address resolves to this host, else
        (None, None).
        """
        src_ = net.choose_not_local(self.source_pub_addresses)[:1]
        if not src_:
            src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1]
        src = src_[0]
        # strip the 'tcp://' prefix and the port suffix
        src = src[6:].split(":")[0]
        if src == socket.gethostname():
            # bind a throwaway socket just to reserve/discover a free port
            sock = self.ctx.socket(zmq.REP)
            port = str(
                sock.bind_to_random_port(
                    "tcp://127.0.0.1",
                    min_port=settings.PORT_RANGE[0],
                    max_port=settings.PORT_RANGE[1]
                )
            )
            sock.close()
            return ("0.0.0.0", port), ""  # empty passwd
        else:
            return None, None

    def mx_addr_str(self, mx_data):
        """Format mx_data's (addr, port) as 'addr:port', or None if unset."""
        if mx_data[0] is None:
            return None
        addr, port = mx_data[0]
        print self.name, "[", self.type, "]", "mx addr str", addr + ":" + str(port)
        return addr + ":" + str(port)

    def peer_env(self, mx_data):
        """Return a copy of os.environ extended with MULTIPLEXER_* variables,
        or None when no multiplexer runs here."""
        if mx_data[0] is None:
            return None
        env = os.environ.copy()
        addr, port = mx_data[0]
        _env = {
            "MULTIPLEXER_ADDRESSES": socket.gethostname() + ":" + str(port),
            "MULTIPLEXER_PASSWORD": mx_data[1],
            "MULTIPLEXER_RULES": launcher_tools.mx_rules_path(),
        }
        env.update(_env)
        return env

    @msg_handlers.handler("start_mx")
    def handle_start_mx(self, message, sock):
        """Launch the multiplexer (if scheduled for this machine) and remove
        it from the pending launch order."""
        if "mx" in self.launch_data and self.mx_data[0] is not None:
            print self.name, "[", self.type, "]", "..starting multiplexer"
            self.peer_order.remove(["mx"])
            self.peers_to_launch.remove("mx")
            path = launcher_tools.mx_path()
            args = [
                "run_multiplexer",
                self.mx_addr_str((("0.0.0.0", self.mx_data[0][1]), self.mx_data[1])),
                "--multiplexer-password",
                self.mx_data[1],
                "--rules",
                launcher_tools.mx_rules_path(),
            ]
            proc, details = self._launch_process(path, args, "multiplexer", "mx", env=self.env)
            self.processes["mx"] = proc
            if proc is not None:
                self.mx = proc

    @msg_handlers.handler("start_peers")
    def handle_start_peers(self, message, sock):
        self._launch_processes(self.launch_data)

    def test(self):
        """Ad-hoc zmq throughput check: count SEND messages on a local SUB
        socket. Debug utility, not part of normal operation."""
        # for i in range(SEND):
        #     send_msg(self.push, str(i))
        self.pull = self.ctx.socket(zmq.SUB)
        self.pull.bind("tcp://*:16789")
        received = 0
        prev = -1
        for i in range(SEND):
            msg = recv_msg(self.pull)
            if int(msg):
                # prev = int(msg)
                received += 1
                if received % 10000 == 0:
                    print "zmq: received ", received, "messages, last: ", msg
        if received == SEND:
            print "zmq: OK"
        else:
            print "WUT?", received
        # self.push.close()
        self.pull.close()

    @msg_handlers.handler("manage_peers")
    def handle_manage_peers(self, message, sock):
        """Morph the running experiment: kill the listed peers (always
        including the config server), merge in new launch data and restart."""
        if not message.receiver == self.uuid:
            return
        # config_server is always restarted so it can restore peer state
        message.kill_peers.append("config_server")
        message.start_peers_data["config_server"] = dict(self.launch_data["config_server"])
        # peers that survive the morph — their config must be restored
        restore_config = [peer for peer in self.processes if peer not in message.kill_peers]
        for peer in message.kill_peers:
            proc = self.processes.get(peer, None)
            if not proc:
                print self.name, "[", self.type, "]", "peer to kill not found:", peer
                continue
            print "MORPH: KILLING ", peer
            proc.kill()
            print "MORPH: KILLED ", peer
            del self.processes[peer]
            del self.launch_data[peer]
        for peer, data in message.start_peers_data.iteritems():
            self.launch_data[peer] = data
        self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers]
        self._launch_processes(message.start_peers_data, restore_config=restore_config)

    def _launch_processes(self, launch_data, restore_config=[]):
        """Launch all peers in launch_data: config_server first, then
        amplifier, then the rest; publish success or failure.

        NOTE: the mutable default for ``restore_config`` is shared across
        calls; callers here never mutate it, but beware.
        """
        proc, details = None, None
        success = True
        path, args = None, None
        self.status = launcher_tools.LAUNCHING
        # fixed priority: config_server, then amplifier, then everything else
        ldata = []
        if "config_server" in launch_data:
            ldata.append(("config_server", launch_data["config_server"]))
        if "amplifier" in launch_data:
            ldata.append(("amplifier", launch_data["amplifier"]))
        for peer, data in launch_data.iteritems():
            if (peer, data) not in ldata:
                ldata.append((peer, data))
        for peer, data in ldata:  # self.launch_data.iteritems():
            wait = 0
            if peer.startswith("mx"):
                # the multiplexer is launched separately (handle_start_mx)
                continue
            path = os.path.join(launcher_tools.obci_root(), data["path"])
            args = data["args"]
            if peer.startswith("config_server"):
                args += ["-p", "launcher_socket_addr", self.cs_addr]
                args += ["-p", "experiment_uuid", self.experiment_uuid]
                if restore_config:
                    args += ["-p", "restore_peers", " ".join(restore_config)]
                # give config_server a head start before the other peers
                wait = 0.4
            proc, details = self._launch_process(path, args, data["peer_type"], peer,
                                                 env=self.env, capture_io=NO_STDIO)
            if proc is not None:
                self.processes[peer] = proc
            else:
                success = False
                break
            time.sleep(wait)
        if success:
            send_msg(self._publish_socket,
                     self.mtool.fill_msg("all_peers_launched", machine=self.machine))
        else:
            print self.name, "[", self.type, "]", "OBCI LAUNCH FAILED"
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=path,
                                    args=args, details=details),
            )
            # one failed peer aborts the whole experiment on this machine
            self.processes = {}
            self.subprocess_mgr.killall()

    def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO):
        """Start one child process via the subprocess manager and publish a
        launch_error or launched_process_info message accordingly.

        Returns (process, details) — process is None on failure.
        """
        proc, details = self.subprocess_mgr.new_local_process(
            path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE,
            capture_io=capture_io, env=env
        )
        if proc is None:
            print self.name, "[", self.type, "]", "process launch FAILED:", path, args
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launch_error",
                    sender=self.uuid,
                    details=dict(machine=self.machine, path=path, args=args, error=details),
                ),
            )
        else:
            print self.name, "[", self.type, "]", "process launch success:", path, args, proc.pid
            send_msg(
                self._publish_socket,
                self.mtool.fill_msg(
                    "launched_process_info",
                    sender=self.uuid,
                    machine=self.machine,
                    pid=proc.pid,
                    proc_type=proc_type,
                    name=name,
                    path=path,
                    args=args,
                ),
            )
        return proc, details

    @msg_handlers.handler("get_tail")
    def handle_get_tail(self, message, sock):
        """Publish the last N lines of a peer's captured stdout."""
        lines = message.len if message.len else DEFAULT_TAIL_RQ
        peer = message.peer_id
        if peer not in self.launch_data:
            return
        experiment_id = self.launch_data[peer]["experiment_id"]
        txt = self.processes[peer].tail_stdout(lines=lines)
        send_msg(
            self._publish_socket,
            self.mtool.fill_msg("tail", txt=txt,
                                sender=self.uuid,
                                experiment_id=experiment_id,
                                peer_id=peer),
        )

    @msg_handlers.handler("experiment_finished")
    def handle_experiment_finished(self, message, sock):
        pass

    @msg_handlers.handler("morph_to_new_scenario")
    def handle_morph(self, message, sock):
        pass

    @msg_handlers.handler("stop_all")
    def handle_stop_all(self, message, sock):
        self.subprocess_mgr.killall()

    @msg_handlers.handler("dead_process")
    def handle_dead_process(self, message, sock):
        """Publish obci_peer_dead for an unexpectedly dead peer/multiplexer;
        deaths of peers listed in ``restarting`` that terminated cleanly are
        expected and suppressed."""
        proc = self.subprocess_mgr.process(message.machine, message.pid)
        if proc is not None:
            proc.mark_delete()
            name = proc.name
            print "~~~~~ ~~~~~ ", name, self.restarting, message.status[0]
            if (proc.proc_type == "obci_peer" or proc.proc_type == "multiplexer") and not (
                name in self.restarting and message.status[0] == "terminated"
            ):
                print "KILLLLLING and sending obci_peer_dead", proc.name
                send_msg(
                    self._publish_socket,
                    self.mtool.fill_msg(
                        "obci_peer_dead",
                        sender=self.uuid,
                        sender_ip=self.machine,
                        peer_id=proc.name,
                        path=proc.path,
                        status=proc.status(),
                    ),
                )
            if name in self.restarting:
                self.restarting.remove(name)

    @msg_handlers.handler("obci_peer_registered")
    def handle_obci_peer_registered(self, message, sock):
        # forward to subscribers unchanged
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_params_changed")
    def handle_obci_peer_params_changed(self, message, sock):
        # forward to subscribers unchanged
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_peer_ready")
    def handle_obci_peer_ready(self, message, sock):
        print self.name, "got!", message.type
        send_msg(self._publish_socket, message.SerializeToString())

    @msg_handlers.handler("obci_control_message")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    # NOTE(review): this method name duplicates the one above, so in the
    # class body the later definition shadows the earlier; only the handler
    # registration done by the decorator appears to distinguish them — confirm.
    @msg_handlers.handler("obci_peer_dead")
    def handle_obci_control_message(self, message, sock):
        # ignore :)
        pass

    @msg_handlers.handler("process_supervisor_registered")
    def handle_supervisor_registered(self, messsage, sock):
        # also ignore
        pass

    def cleanup_before_net_shutdown(self, kill_message, sock=None):
        # drop references only; the actual kill happens in clean_up()
        self.processes = {}
        # self.subprocess_mgr.killall()

    def clean_up(self):
        """Kill and forget all child processes."""
        print self.name, "[", self.type, "]", "cleaning up"
        self.processes = {}
        self.subprocess_mgr.killall()
        self.subprocess_mgr.delete_all()
class OBCIProcessSupervisor(OBCIControlPeer): msg_handlers = OBCIControlPeer.msg_handlers.copy() @log_crash def __init__(self, sandbox_dir, source_addresses=None, source_pub_addresses=None, rep_addresses=None, pub_addresses=None, experiment_uuid='', name='obci_process_supervisor'): self.peers = {} self.status = launcher_tools.READY_TO_LAUNCH self.source_pub_addresses = source_pub_addresses self.machine = socket.gethostname() self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR self.ctx = zmq.Context() self.mx_data = self.set_mx_data() self.env = self.peer_env(self.mx_data) self.launch_data = [] self.peer_order = [] self._running_peer_order = [] self._current_part = None self.__cfg_launch_info = None self.__cfg_morph = False self.experiment_uuid = experiment_uuid self.peers_to_launch = [] self.processes = {} self.restarting = [] self.rqs = 0 self._nearby_machines = net.DNS() self.test_count = 0 self.__cfg_lock = threading.RLock() super(OBCIProcessSupervisor, self).__init__( source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name) self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger) def peer_type(self): return "obci_process_supervisor" def net_init(self): self.source_sub_socket = self.ctx.socket(zmq.SUB) self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "") self._all_sockets.append(self.source_sub_socket) if self.source_pub_addresses: for addr in self.source_pub_addresses: self.source_sub_socket.connect(addr) (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL) # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "") self.cs_addr = net.choose_local(self.cs_addresses) if not self.cs_addr: self.cs_addr = net.choose_not_local(self.cs_addresses)[0] else: self.cs_addr = self.cs_addr[0] self._all_sockets.append(self.config_server_socket) super(OBCIProcessSupervisor, self).net_init() def params_for_registration(self): mx_data = None if None not 
in self.mx_data: mx_data = [self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]] return dict(pid=os.getpid(), machine=self.machine, mx_data=mx_data) def custom_sockets(self): return [self.source_sub_socket, self.config_server_socket] def _handle_registration_response(self, response): self.launch_data = response.params['launch_data'] self.peers_to_launch = list(self.launch_data.keys()) self.peer_order = response.params['peer_order'] for part in self.peer_order: self._running_peer_order.append(list(part)) self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data) def set_mx_data(self): src_ = net.choose_not_local(self.source_pub_addresses)[:1] if not src_: src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1] src = src_[0] src = src[6:].split(':')[0] if src == socket.gethostname(): sock = self.ctx.socket(zmq.REP) port = str(sock.bind_to_random_port("tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1])) sock.close() return ('0.0.0.0', port), "" # empty passwd else: return None, None def mx_addr_str(self, mx_data): if mx_data[0] is None: return None addr, port = mx_data[0] self.logger.info("mx addr str: " + addr + ':' + str(port)) return addr + ':' + str(port) def peer_env(self, mx_data): if mx_data[0] is None: return None env = os.environ.copy() addr, port = mx_data[0] if addr == '0.0.0.0': addr = socket.gethostname() _env = { "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port), "MULTIPLEXER_PASSWORD": '', # mx_data[1], "MULTIPLEXER_RULES": str(launcher_tools.mx_rules_path()) } env.update(_env) return env @msg_handlers.handler("start_mx") def handle_start_mx(self, message, sock): if 'mx' in self.launch_data and self.mx_data[0] is not None: self.logger.info("..starting multiplexer") self.peer_order.remove(['mx']) self.peers_to_launch.remove('mx') path = launcher_tools.mx_path() args = ['run_multiplexer', self.mx_addr_str( (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])), 
'--multiplexer-password', self.mx_data[1], '--rules', launcher_tools.mx_rules_path()] proc, details = self._launch_process(path, args, 'multiplexer', 'mx', env=self.env) self.processes['mx'] = proc if proc is not None: self.mx = proc @msg_handlers.handler("start_config_server") def handle_start_config_srv(self, message, sock): if 'mx' not in self.launch_data: mx_addr = message.mx_data[1].split(':') mx_addr[1] = int(mx_addr[1]) md = list(self.mx_data) md[0] = tuple(mx_addr) self.mx_data = tuple(md) self.env = self.peer_env(self.mx_data) if "config_server" in self.launch_data: proc, details, wait, info_obj = \ self.launch_process("config_server", self.launch_data["config_server"], restore_config=message.restore_config) tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work) tim.start() def __if_config_server_conn_didnt_work(self): with self.__cfg_lock: if self.__cfg_launch_info: send_msg(self._publish_socket, self.__cfg_launch_info) self.__cfg_launch_info = None self.logger.info("connection to config server is shaky :(") @msg_handlers.handler("start_peers") def handle_start_peers(self, message, sock): self.logger.info("start peers -- my mx_data: %s, received mx_data: %s", self.mx_data, message.mx_data) if 'mx' not in self.launch_data: mx_addr = message.mx_data[1].split(':') mx_addr[1] = int(mx_addr[1]) md = list(self.mx_data) md[0] = tuple(mx_addr) self.mx_data = tuple(md) self.env = self.peer_env(self.mx_data) # tmp.workarounds: wait for mx on other machine to initialize time.sleep(0.75) if message.add_launch_data: if self.machine in message.add_launch_data: self._launch_processes(message.add_launch_data[self.machine]) else: self._launch_processes(self.launch_data) @msg_handlers.handler("manage_peers") def handle_manage_peers(self, message, sock): if not message.receiver == self.uuid: return for peer in message.kill_peers: proc = self.processes.get(peer, None) if not proc: self.logger.error("peer to kill not found: %s", peer) continue 
self.logger.info("MORPH: KILLING %s ", peer) proc.kill_with_force() self.logger.info("MORPH: KILLED %s ", peer) del self.processes[peer] del self.launch_data[peer] for peer, data in message.start_peers_data.iteritems(): self.launch_data[peer] = data self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers] self._launch_processes(message.start_peers_data) def _launch_processes(self, launch_data, restore_config=[]): proc, details, info_obj = None, None, None success = True path, args = None, None self.status = launcher_tools.LAUNCHING ldata = [] if 'amplifier' in launch_data: ldata.append(('amplifier', launch_data['amplifier'])) for peer, data in launch_data.iteritems(): if (peer, data) not in ldata and peer != 'config_server': ldata.append((peer, data)) for peer, data in ldata: # self.launch_data.iteritems(): if peer.startswith('mx'): continue proc, details, wait, info_obj = self.launch_process(peer, data, restore_config=restore_config) time.sleep(wait) if proc is None: success = False break if success: send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched", machine=self.machine)) def launch_process(self, peer, launch_data, restore_config=[]): data = launch_data wait = 0 p = os.path.expanduser(data['path']) if not os.path.isabs(p): path = os.path.join(launcher_tools.obci_root(), p) path = os.path.abspath(path) else: path = os.path.realpath(p) args = data['args'] args = self._attach_base_config_path(path, args) args += ['-p', 'experiment_uuid', self.experiment_uuid] if peer.startswith('config_server'): args += ['-p', 'launcher_socket_addr', self.cs_addr] if restore_config: args += ['-p', 'restore_peers', ' '.join(restore_config)] # wait = 0.5 if "log_dir" in args: idx = args.index("log_dir") + 1 log_dir = args[idx] log_dir = os.path.join(log_dir, self.name) args[idx] = log_dir else: log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name) args += ['-p', 'log_dir', log_dir] if not os.path.exists(log_dir): 
os.makedirs(log_dir) proc, details = self._launch_process(path, args, data['peer_type'], peer, env=self.env, capture_io=NO_STDIO) info_obj = { "path": path, "args": args, "peer": peer } if proc is not None: self.processes[peer] = proc else: self.logger.error("OBCI LAUNCH FAILED") send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=info_obj['path'], args=info_obj['args'], details=details)) self.processes = {} self.subprocess_mgr.killall(force=True) return proc, details, wait, info_obj def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO): self.logger.debug("launching..... %s %s", path, args) proc, details = self.subprocess_mgr.new_local_process(path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE, capture_io=capture_io, env=env) if proc is None: self.logger.error("process launch FAILED: %s --- %s", path, str(args)) send_msg(self._publish_socket, self.mtool.fill_msg("launch_error", sender=self.uuid, details=dict(machine=self.machine, path=path, args=args, error=details, peer_id=name))) else: self.logger.info("process launch success:" + path + str(args) + str(proc.pid)) msg = self.mtool.fill_msg("launched_process_info", sender=self.uuid, machine=self.machine, pid=proc.pid, proc_type=proc_type, name=name, path=path, args=args) if name == "config_server": self.__cfg_launch_info = msg else: send_msg(self._publish_socket, msg) return proc, details def _attach_base_config_path(self, launch_path, launch_args): peer_id = launch_args[0] base = launch_path.rsplit('.', 1)[0] ini = '.'.join([base, 'ini']) return [peer_id, ini] + launch_args[1:] @msg_handlers.handler("get_tail") def handle_get_tail(self, message, sock): lines = message.len if message.len else DEFAULT_TAIL_RQ peer = message.peer_id if peer not in self.launch_data: return experiment_id = self.launch_data[peer]['experiment_id'] txt = self.processes[peer].tail_stdout(lines=lines) send_msg(self._publish_socket, 
self.mtool.fill_msg("tail", txt=txt, sender=self.uuid, experiment_id=experiment_id, peer_id=peer)) @msg_handlers.handler("experiment_finished") def handle_experiment_finished(self, message, sock): pass @msg_handlers.handler("morph_to_new_scenario") def handle_morph(self, message, sock): pass @msg_handlers.handler('nearby_machines') def handle_nearby_machines(self, message, sock): self._nearby_machines.mass_update(message.nearby_machines) @msg_handlers.handler("stop_all") def handle_stop_all(self, message, sock): self.subprocess_mgr.killall(force=True) @msg_handlers.handler("_kill_peer") def handle_kill_peer(self, message, sock): proc = self.processes.get(message.peer_id, None) if proc is not None: # is on this machine if message.morph and message.peer_id == 'config_server': self.__cfg_morph = True proc.kill_with_force() @msg_handlers.handler("rq_ok") def handle_rq_ok(self, message, sock): self.rqs += 1 # print "--> ", self.rqs if self.rqs == 10000: self.logger.debug("GOT %s %s", str(self.rqs), "messages!") self.rqs = 0 @msg_handlers.handler("experiment_launch_error") def handle_experiment_launch_error(self, message, sock): self.subprocess_mgr.killall(force=True) @msg_handlers.handler("dead_process") def handle_dead_process(self, message, sock): proc = self.subprocess_mgr.process(message.machine, message.pid) if proc is not None: proc.mark_delete() name = proc.name if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \ not (name in self.restarting and message.status[0] == 'terminated'): self.logger.info("KILLLING! 
sending obci_peer_" "dead for process %s", proc.name) send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead", sender=self.uuid, sender_ip=self.machine, peer_id=proc.name, path=proc.path, status=proc.status() )) if name in self.restarting: self.restarting.remove(name) if self.__cfg_morph and name == 'config_server': self.__cfg_morph = False @msg_handlers.handler("obci_peer_registered") def handle_obci_peer_registered(self, message, sock): send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_peer_params_changed") def handle_obci_peer_params_changed(self, message, sock): send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_peer_ready") def handle_obci_peer_ready(self, message, sock): self.logger.info("got! " + message.type) send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("config_server_ready") def handle_obci_peer_ready(self, message, sock): # config_server successfully connected to MX, now send "launched_process_info" with self.__cfg_lock: if self.__cfg_launch_info: send_msg(self._publish_socket, self.__cfg_launch_info) self.__cfg_launch_info = None @msg_handlers.handler("obci_control_message") def handle_obci_control_message(self, message, sock): # ignore :) pass @msg_handlers.handler("obci_peer_dead") def handle_obci_control_message(self, message, sock): # ignore :) pass @msg_handlers.handler("process_supervisor_registered") def handle_supervisor_registered(self, messsage, sock): # also ignore pass def cleanup_before_net_shutdown(self, kill_message, sock=None): self.processes = {} self.subprocess_mgr.killall(force=True) def clean_up(self): self.logger.info("cleaning up") self.processes = {} self.subprocess_mgr.killall(force=True) self.subprocess_mgr.delete_all() def _crash_extra_data(self, exception=None): data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception) data.update({ 'experiment_uuid': self.experiment_uuid, 'name': self.name }) 
return data
class OBCIProcessSupervisor(OBCIControlPeer): msg_handlers = OBCIControlPeer.msg_handlers.copy() @log_crash def __init__(self, sandbox_dir, source_addresses=None, source_pub_addresses=None, rep_addresses=None, pub_addresses=None, experiment_uuid='', name='obci_process_supervisor'): self.peers = {} self.status = launcher_tools.READY_TO_LAUNCH self.source_pub_addresses = source_pub_addresses self.machine = socket.gethostname() self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR self.ctx = zmq.Context() self.mx_data = self.set_mx_data() self.env = self.peer_env(self.mx_data) self.launch_data = [] self.peer_order = [] self._running_peer_order = [] self._current_part = None self.__cfg_launch_info = None self.__cfg_morph = False self.experiment_uuid = experiment_uuid self.peers_to_launch = [] self.processes = {} self.restarting = [] self.rqs = 0 self._nearby_machines = net.DNS() self.test_count = 0 self.__cfg_lock = threading.RLock() super(OBCIProcessSupervisor, self).__init__( source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name) self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid, logger=self.logger) def peer_type(self): return "obci_process_supervisor" def net_init(self): self.source_sub_socket = self.ctx.socket(zmq.SUB) self.source_sub_socket.setsockopt_string(zmq.SUBSCRIBE, "") self._all_sockets.append(self.source_sub_socket) if self.source_pub_addresses: for addr in self.source_pub_addresses: self.source_sub_socket.connect(addr) (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL) # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "") self.cs_addr = net.choose_local(self.cs_addresses) if not self.cs_addr: self.cs_addr = net.choose_not_local(self.cs_addresses)[0] else: self.cs_addr = self.cs_addr[0] self._all_sockets.append(self.config_server_socket) super(OBCIProcessSupervisor, self).net_init() def params_for_registration(self): mx_data = None if 
None not in self.mx_data: mx_data = [self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]] return dict(pid=os.getpid(), machine=self.machine, mx_data=mx_data) def custom_sockets(self): return [self.source_sub_socket, self.config_server_socket] def _handle_registration_response(self, response): self.launch_data = response.params['launch_data'] self.peers_to_launch = list(self.launch_data.keys()) self.peer_order = response.params['peer_order'] for part in self.peer_order: self._running_peer_order.append(list(part)) self.logger.info("RECEIVED LAUNCH DATA: %s", self.launch_data) def set_mx_data(self): src_ = net.choose_not_local(self.source_pub_addresses)[:1] if not src_: src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1] src = src_[0] src = src[6:].split(':')[0] if src == socket.gethostname(): sock = self.ctx.socket(zmq.REP) port = str(sock.bind_to_random_port("tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1])) sock.close() return ('0.0.0.0', port), "" # empty passwd else: return None, None def mx_addr_str(self, mx_data): if mx_data[0] is None: return None addr, port = mx_data[0] self.logger.info("mx addr str: " + addr + ':' + str(port)) return addr + ':' + str(port) def peer_env(self, mx_data): if mx_data[0] is None: return None env = os.environ.copy() addr, port = mx_data[0] if addr == '0.0.0.0': addr = socket.gethostname() _env = { "MULTIPLEXER_ADDRESSES": str(addr) + ':' + str(port) } env.update(_env) return env @msg_handlers.handler("start_broker") def handle_start_broker(self, message, sock): if 'mx' in self.launch_data and self.mx_data[0] is not None: self.logger.info("..starting multiplexer") self.peer_order.remove(['mx']) self.peers_to_launch.remove('mx') path = launcher_tools.broker_path() args = [ 'run_multiplexer', self.mx_addr_str((('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])) ] proc, details = self._launch_process(path, args, 'multiplexer', 'mx', env=self.env) 
self.processes['mx'] = proc if proc is not None: self.mx = proc @msg_handlers.handler("start_config_server") def handle_start_config_srv(self, message, sock): if 'mx' not in self.launch_data: mx_addr = message.mx_data[1].split(':') mx_addr[1] = int(mx_addr[1]) md = list(self.mx_data) md[0] = tuple(mx_addr) self.mx_data = tuple(md) self.env = self.peer_env(self.mx_data) if "config_server" in self.launch_data: proc, details, wait, info_obj = \ self.launch_process("config_server", self.launch_data["config_server"], restore_config=message.restore_config) tim = threading.Timer(1.5, self.__if_config_server_conn_didnt_work) tim.start() def __if_config_server_conn_didnt_work(self): with self.__cfg_lock: if self.__cfg_launch_info: send_msg(self._publish_socket, self.__cfg_launch_info) self.__cfg_launch_info = None self.logger.info("connection to config server is shaky :(") @msg_handlers.handler("start_peers") def handle_start_peers(self, message, sock): self.logger.info("start peers -- my mx_data: %s, received mx_data: %s", self.mx_data, message.mx_data) if 'mx' not in self.launch_data: mx_addr = message.mx_data[1].split(':') mx_addr[1] = int(mx_addr[1]) md = list(self.mx_data) md[0] = tuple(mx_addr) self.mx_data = tuple(md) self.env = self.peer_env(self.mx_data) # tmp.workarounds: wait for mx on other machine to initialize time.sleep(0.75) if message.add_launch_data: if self.machine in message.add_launch_data: self._launch_processes(message.add_launch_data[self.machine]) else: self._launch_processes(self.launch_data) @msg_handlers.handler("manage_peers") def handle_manage_peers(self, message, sock): if not message.receiver == self.uuid: return for peer in message.kill_peers: proc = self.processes.get(peer, None) if not proc: self.logger.error("peer to kill not found: %s", peer) continue self.logger.info("MORPH: KILLING %s ", peer) proc.kill_with_force() self.logger.info("MORPH: KILLED %s ", peer) del self.processes[peer] del self.launch_data[peer] for peer, data in 
message.start_peers_data.items(): self.launch_data[peer] = data self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers] self._launch_processes(message.start_peers_data) def _launch_processes(self, launch_data, restore_config=[]): proc, details, info_obj = None, None, None success = True self.status = launcher_tools.LAUNCHING ldata = [] if 'amplifier' in launch_data: ldata.append(('amplifier', launch_data['amplifier'])) for peer, data in launch_data.items(): if (peer, data) not in ldata and peer != 'config_server': ldata.append((peer, data)) for peer, data in ldata: # self.launch_data.iteritems(): if peer.startswith('mx'): continue proc, details, wait, info_obj = self.launch_process(peer, data, restore_config=restore_config) time.sleep(wait) if proc is None: success = False break if success: send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched", machine=self.machine)) def launch_process(self, peer, launch_data, restore_config=[]): data = launch_data wait = 0 p = os.path.expanduser(data['path']) if not os.path.isabs(p): path = os.path.join(launcher_tools.obci_root(), p) path = os.path.abspath(path) else: path = os.path.realpath(p) args = data['args'] args = self._attach_base_config_path(path, args) args += ['-p', 'experiment_uuid', self.experiment_uuid] if peer.startswith('config_server'): args += ['-p', 'launcher_socket_addr', self.cs_addr] if restore_config: args += ['-p', 'restore_peers', ' '.join(restore_config)] # wait = 0.5 if "log_dir" in args: idx = args.index("log_dir") + 1 log_dir = args[idx] log_dir = os.path.join(log_dir, self.name) args[idx] = log_dir else: log_dir = os.path.join(CONFIG_DEFAULTS["log_dir"], self.name) args += ['-p', 'log_dir', log_dir] if not os.path.exists(log_dir): os.makedirs(log_dir) proc, details = self._launch_process(path, args, data['peer_type'], peer, env=self.env, capture_io=NO_STDIO) info_obj = { "path": path, "args": args, "peer": peer } if proc is not None: 
self.processes[peer] = proc else: self.logger.error("OBCI LAUNCH FAILED") send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=info_obj['path'], args=info_obj['args'], details=details)) self.processes = {} self.subprocess_mgr.killall(force=True) return proc, details, wait, info_obj def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO): self.logger.debug("launching..... %s %s", path, args) proc, details = self.subprocess_mgr.new_local_process(path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE, capture_io=capture_io, env=env) if proc is None: self.logger.error("process launch FAILED: %s --- %s", path, str(args)) send_msg(self._publish_socket, self.mtool.fill_msg("launch_error", sender=self.uuid, details=dict(machine=self.machine, path=path, args=args, error=details, peer_id=name))) else: self.logger.info("process launch success:" + path + str(args) + str(proc.pid)) msg = self.mtool.fill_msg("launched_process_info", sender=self.uuid, machine=self.machine, pid=proc.pid, proc_type=proc_type, name=name, path=path, args=args) if name == "config_server": self.__cfg_launch_info = msg else: send_msg(self._publish_socket, msg) return proc, details def _attach_base_config_path(self, launch_path, launch_args): peer_id = launch_args[0] base = launch_path.rsplit('.', 1)[0] ini = '.'.join([base, 'ini']) return [peer_id, ini] + launch_args[1:] @msg_handlers.handler("get_tail") def handle_get_tail(self, message, sock): lines = message.len if message.len else DEFAULT_TAIL_RQ peer = message.peer_id if peer not in self.launch_data: return experiment_id = self.launch_data[peer]['experiment_id'] txt = self.processes[peer].tail_stdout(lines=lines) send_msg(self._publish_socket, self.mtool.fill_msg("tail", txt=txt, sender=self.uuid, experiment_id=experiment_id, peer_id=peer)) @msg_handlers.handler("experiment_finished") def handle_experiment_finished(self, message, sock): pass 
@msg_handlers.handler("morph_to_new_scenario") def handle_morph(self, message, sock): pass @msg_handlers.handler('nearby_machines') def handle_nearby_machines(self, message, sock): self._nearby_machines.mass_update(message.nearby_machines) @msg_handlers.handler("stop_all") def handle_stop_all(self, message, sock): self.subprocess_mgr.killall(force=True) @msg_handlers.handler("_kill_peer") def handle_kill_peer(self, message, sock): proc = self.processes.get(message.peer_id, None) if proc is not None: # is on this machine if message.morph and message.peer_id == 'config_server': self.__cfg_morph = True proc.kill_with_force() @msg_handlers.handler("rq_ok") def handle_rq_ok(self, message, sock): self.rqs += 1 # print "--> ", self.rqs if self.rqs == 10000: self.logger.debug("GOT %s %s", str(self.rqs), "messages!") self.rqs = 0 @msg_handlers.handler("experiment_launch_error") def handle_experiment_launch_error(self, message, sock): self.subprocess_mgr.killall(force=True) @msg_handlers.handler("dead_process") def handle_dead_process(self, message, sock): proc = self.subprocess_mgr.process(message.machine, message.pid) if proc is not None: proc.mark_delete() name = proc.name if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \ not (name in self.restarting and message.status[0] == 'terminated'): self.logger.info("KILLLING! 
sending obci_peer_" "dead for process %s", proc.name) send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead", sender=self.uuid, sender_ip=self.machine, peer_id=proc.name, path=proc.path, status=proc.status() )) if name in self.restarting: self.restarting.remove(name) if self.__cfg_morph and name == 'config_server': self.__cfg_morph = False @msg_handlers.handler("obci_peer_registered") def handle_obci_peer_registered(self, message, sock): send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_peer_params_changed") def handle_obci_peer_params_changed(self, message, sock): send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_peer_ready") def handle_obci_peer_ready(self, message, sock): self.logger.info("got! " + message.type) send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("config_server_ready") def handle_obci_peer_ready(self, message, sock): # config_server successfully connected to MX, now send "launched_process_info" with self.__cfg_lock: if self.__cfg_launch_info: send_msg(self._publish_socket, self.__cfg_launch_info) self.__cfg_launch_info = None @msg_handlers.handler("obci_control_message") def handle_obci_control_message(self, message, sock): # ignore :) pass @msg_handlers.handler("obci_peer_dead") def handle_obci_control_message(self, message, sock): # ignore :) pass @msg_handlers.handler("process_supervisor_registered") def handle_supervisor_registered(self, messsage, sock): # also ignore pass def cleanup_before_net_shutdown(self, kill_message, sock=None): self.processes = {} self.subprocess_mgr.killall(force=True) def clean_up(self): self.logger.info("cleaning up") self.processes = {} self.subprocess_mgr.killall(force=True) self.subprocess_mgr.delete_all() def _crash_extra_data(self, exception=None): data = super(OBCIProcessSupervisor, self)._crash_extra_data(exception) data.update({ 'experiment_uuid': self.experiment_uuid, 'name': self.name }) 
return data
class OBCIProcessSupervisor(OBCIControlPeer): msg_handlers = OBCIControlPeer.msg_handlers.copy() def __init__(self, sandbox_dir, source_addresses=None, source_pub_addresses=None, rep_addresses=None, pub_addresses=None, experiment_uuid='', name='obci_process_supervisor'): self.peers = {} self.status = launcher_tools.READY_TO_LAUNCH self.source_pub_addresses = source_pub_addresses self.machine = socket.gethostname() self.sandbox_dir = sandbox_dir if sandbox_dir else settings.DEFAULT_SANDBOX_DIR self.ctx = zmq.Context() self.mx_data = self.set_mx_data() self.env = self.peer_env(self.mx_data) self.launch_data = [] self.peer_order = [] self._running_peer_order = [] self._current_part = None self.experiment_uuid = experiment_uuid self.peers_to_launch = [] self.processes = {} self.restarting = [] super(OBCIProcessSupervisor, self).__init__( source_addresses=source_addresses, rep_addresses=rep_addresses, pub_addresses=pub_addresses, name=name) self.subprocess_mgr = SubprocessMonitor(self.ctx, self.uuid) def peer_type(self): return "obci_process_supervisor" def net_init(self): self.source_sub_socket = self.ctx.socket(zmq.SUB) self.source_sub_socket.setsockopt(zmq.SUBSCRIBE, "") self._all_sockets.append(self.source_sub_socket) if self.source_pub_addresses: for addr in self.source_pub_addresses: self.source_sub_socket.connect(addr) (self.config_server_socket, self.cs_addresses) = self._init_socket([], zmq.PULL) # self.config_server_socket.setsockopt(zmq.SUBSCRIBE, "") self.cs_addr = net.choose_not_local(self.cs_addresses) if not self.cs_addr: self.cs_addr = net.choose_local(self.cs_addresses)[0] else: self.cs_addr = self.cs_addr[0] self._all_sockets.append(self.config_server_socket) super(OBCIProcessSupervisor, self).net_init() def params_for_registration(self): return dict(pid=os.getpid(), machine=self.machine, mx_data=[self.mx_addr_str(((socket.gethostname(), self.mx_data[0][1]), self.mx_data[1])), self.mx_data[1]]) def custom_sockets(self): return [self.source_sub_socket, 
self.config_server_socket] def _handle_registration_response(self, response): self.launch_data = response.params['launch_data'] self.peers_to_launch = list(self.launch_data.keys()) self.peer_order = response.params['peer_order'] for part in self.peer_order: self._running_peer_order.append(list(part)) print self.name,'[', self.type, ']', "RECEIVED LAUNCH DATA: ", self.launch_data def set_mx_data(self): src_ = net.choose_not_local(self.source_pub_addresses)[:1] if not src_: src_ = net.choose_local(self.source_pub_addresses, ip=True)[:1] src = src_[0] src = src[6:].split(':')[0] if src == socket.gethostname(): sock = self.ctx.socket(zmq.REP) port = str(sock.bind_to_random_port("tcp://127.0.0.1", min_port=settings.PORT_RANGE[0], max_port=settings.PORT_RANGE[1])) sock.close() return ('0.0.0.0', port), "" #empty passwd else: return None, None def mx_addr_str(self, mx_data): if mx_data[0] is None: return None addr, port = mx_data[0] print self.name,'[', self.type, ']', "mx addr str", addr + ':' + str(port) return addr + ':' + str(port) def peer_env(self, mx_data): if mx_data[0] is None: return None env = os.environ.copy() addr, port = mx_data[0] _env = { "MULTIPLEXER_ADDRESSES": socket.gethostname() + ':' + str(port), "MULTIPLEXER_PASSWORD": mx_data[1], "MULTIPLEXER_RULES": launcher_tools.mx_rules_path() } env.update(_env) return env @msg_handlers.handler("start_mx") def handle_start_mx(self, message, sock): if 'mx' in self.launch_data and self.mx_data[0] is not None: print self.name,'[', self.type, ']', "..starting multiplexer" self.peer_order.remove(['mx']) self.peers_to_launch.remove('mx') path = launcher_tools.mx_path() args = ['run_multiplexer', self.mx_addr_str( (('0.0.0.0', self.mx_data[0][1]), self.mx_data[1])), '--multiplexer-password', self.mx_data[1], '--rules', launcher_tools.mx_rules_path()] proc, details = self._launch_process(path, args, 'multiplexer', 'mx', env=self.env) self.processes['mx'] = proc if proc is not None: self.mx = proc 
@msg_handlers.handler("start_peers") def handle_start_peers(self, message, sock): self._launch_processes(self.launch_data) def test(self): # for i in range(SEND): # send_msg(self.push, str(i)) self.pull = self.ctx.socket(zmq.SUB) self.pull.bind('tcp://*:16789') received = 0 prev = -1 for i in range(SEND): msg = recv_msg(self.pull) if int(msg): # prev = int(msg) received += 1 if received % 10000 == 0: print "zmq: received ", received, "messages, last: ", msg if received == SEND: print "zmq: OK" else: print "WUT?", received # self.push.close() self.pull.close() @msg_handlers.handler("manage_peers") def handle_manage_peers(self, message, sock): if not message.receiver == self.uuid: return message.kill_peers.append('config_server') message.start_peers_data['config_server'] = dict(self.launch_data['config_server']) restore_config = [peer for peer in self.processes if peer not in message.kill_peers] for peer in message.kill_peers: proc = self.processes.get(peer, None) if not proc: print self.name,'[', self.type, ']', "peer to kill not found:", peer continue print "MORPH: KILLING ", peer proc.kill() print "MORPH: KILLED ", peer del self.processes[peer] del self.launch_data[peer] for peer, data in message.start_peers_data.iteritems(): self.launch_data[peer] = data self.restarting = [peer for peer in message.start_peers_data if peer in message.kill_peers] self._launch_processes(message.start_peers_data, restore_config=restore_config) def _launch_processes(self, launch_data, restore_config=[]): proc, details = None, None success = True path, args = None, None self.status = launcher_tools.LAUNCHING ldata = [] if 'config_server' in launch_data: ldata.append(('config_server', launch_data['config_server'])) if 'amplifier' in launch_data: ldata.append(('amplifier', launch_data['amplifier'])) for peer, data in launch_data.iteritems(): if (peer, data) not in ldata: ldata.append((peer, data)) for peer, data in ldata:#self.launch_data.iteritems(): wait = 0 if peer.startswith('mx'): 
continue path = os.path.join(launcher_tools.obci_root(), data['path']) args = data['args'] if peer.startswith('config_server'): args += ['-p', 'launcher_socket_addr', self.cs_addr] args += ['-p', 'experiment_uuid', self.experiment_uuid] if restore_config: args += ['-p', 'restore_peers', ' '.join(restore_config)] wait = 0.4 proc, details = self._launch_process(path, args, data['peer_type'], peer, env=self.env, capture_io=NO_STDIO) if proc is not None: self.processes[peer] = proc else: success = False break time.sleep(wait) if success: send_msg(self._publish_socket, self.mtool.fill_msg("all_peers_launched", machine=self.machine)) else: print self.name,'[', self.type, ']', "OBCI LAUNCH FAILED" send_msg(self._publish_socket, self.mtool.fill_msg("obci_launch_failed", machine=self.machine, path=path, args=args, details=details)) self.processes = {} self.subprocess_mgr.killall() def _launch_process(self, path, args, proc_type, name, env=None, capture_io=NO_STDIO): proc, details = self.subprocess_mgr.new_local_process(path, args, proc_type=proc_type, name=name, monitoring_optflags=RETURNCODE, capture_io=capture_io, env=env) if proc is None: print self.name,'[', self.type, ']', "process launch FAILED:", path, args send_msg(self._publish_socket, self.mtool.fill_msg("launch_error", sender=self.uuid, details=dict(machine=self.machine, path=path, args=args, error=details))) else: print self.name,'[', self.type, ']', "process launch success:", path, args, proc.pid send_msg(self._publish_socket, self.mtool.fill_msg("launched_process_info", sender=self.uuid, machine=self.machine, pid=proc.pid, proc_type=proc_type, name=name, path=path, args=args)) return proc, details @msg_handlers.handler("get_tail") def handle_get_tail(self, message, sock): lines = message.len if message.len else DEFAULT_TAIL_RQ peer = message.peer_id if peer not in self.launch_data: return experiment_id = self.launch_data[peer]['experiment_id'] txt = self.processes[peer].tail_stdout(lines=lines) 
send_msg(self._publish_socket, self.mtool.fill_msg("tail", txt=txt, sender=self.uuid, experiment_id=experiment_id, peer_id=peer)) @msg_handlers.handler("experiment_finished") def handle_experiment_finished(self, message, sock): pass @msg_handlers.handler("morph_to_new_scenario") def handle_morph(self, message, sock): pass @msg_handlers.handler("stop_all") def handle_stop_all(self, message, sock): self.subprocess_mgr.killall() @msg_handlers.handler("dead_process") def handle_dead_process(self, message, sock): proc = self.subprocess_mgr.process(message.machine, message.pid) if proc is not None: proc.mark_delete() name = proc.name print '~~~~~ ~~~~~ ', name, self.restarting, message.status[0] if (proc.proc_type == 'obci_peer' or proc.proc_type == 'multiplexer') and \ not (name in self.restarting and message.status[0] == 'terminated'): print "KILLLLLING and sending obci_peer_dead", proc.name send_msg(self._publish_socket, self.mtool.fill_msg("obci_peer_dead", sender=self.uuid, sender_ip=self.machine, peer_id=proc.name, path=proc.path, status=proc.status() )) if name in self.restarting: self.restarting.remove(name) @msg_handlers.handler("obci_peer_registered") def handle_obci_peer_registered(self, message, sock): send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_peer_params_changed") def handle_obci_peer_params_changed(self, message, sock): send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_peer_ready") def handle_obci_peer_ready(self, message, sock): print self.name , "got!", message.type send_msg(self._publish_socket, message.SerializeToString()) @msg_handlers.handler("obci_control_message") def handle_obci_control_message(self, message, sock): # ignore :) pass @msg_handlers.handler("obci_peer_dead") def handle_obci_control_message(self, message, sock): # ignore :) pass @msg_handlers.handler("process_supervisor_registered") def handle_supervisor_registered(self, messsage, sock): # also ignore 
pass def cleanup_before_net_shutdown(self, kill_message, sock=None): self.processes = {} #self.subprocess_mgr.killall() def clean_up(self): print self.name,'[', self.type, ']', "cleaning up" self.processes = {} self.subprocess_mgr.killall() self.subprocess_mgr.delete_all()