def _handle_command_setup(self, req):
    if self.pipeline is not None and not self.pipeline.is_reset():
        return {
            "node": req["node"],
            "command": Command.ERROR,
            "describtion": "previous pipeline is still running"
        }

    taskchain = [(t["function"], t.get("args", set()), t["is_generator"]) for t in req["tasks"]]
    try:
        self.pipeline = pipeline.Pipeline(
            self.log_addr,
            name=req["name"],
            version=req["version"],
            taskchain=taskchain,
            benchmark_folder=self.benchmark_folder)
    except AssertionError as e:
        self.pipeline = None
        return {
            "node": req["node"],
            "command": Command.ERROR,
            "describtion": traceback.format_exc()
        }

    n_workers = req["n_workers"]
    self.pipeline.boot(n_workers=n_workers if n_workers is not None else mp.cpu_count())
    mpl.log("pipeline setup: " + self.pipeline.get_name() + " v" + str(self.pipeline.get_version()))

    return {
        "node": req["node"],
        "command": Command.SETUP
    }
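# Illustrative sketch only (not part of the server): the shape of a setup request
# as this handler reads it. Only the top-level keys ("node", "name", "version",
# "tasks", "n_workers") and the per-task keys ("function", "args", "is_generator")
# come from the handler above; every value below is a made-up placeholder.
def _example_task(item):
    return item

_example_setup_request = {
    "node": ("127.0.0.1", 6000),    # hypothetical node address
    "name": "example-pipeline",
    "version": 1,
    "tasks": [
        {"function": _example_task, "args": (), "is_generator": False},
    ],
    "n_workers": None,              # None lets the node fall back to mp.cpu_count()
}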
def serve(self):
    signal.signal(signal.SIGINT, self._signal_handler)
    signal.signal(signal.SIGTERM, self._signal_handler)

    self.start_time = time.time()
    mpl.log("node server started at " + self.addr[0] + ":" + str(self.addr[1]), self.log_addr)
    with mpc.Listener(self.addr, "AF_INET", self.conn_buffer_size, None) as lst:
        while True:
            conn = lst.accept()
            threading.Thread(target=self._handle_connection, args=(conn, lst.last_accepted), daemon=True).start()
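def _example_send_request(addr=("127.0.0.1", 6000)):
    # Minimal sketch, assuming the node listens on `addr` without an authkey (as in
    # the Listener call above). The payload is a placeholder; the actual request
    # schema is defined by the Client class / _handle_request, which are not shown
    # here.
    from multiprocessing.connection import Client as _ConnClient
    with _ConnClient(addr, "AF_INET") as conn:
        conn.send({"node": addr})   # placeholder payload
        return conn.recv()          # the server answers with a dict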
def _handle_command_reset(self, req):
    if self.pipeline is not None:
        self.pipeline.reset()
        self.pipeline = None

    mpl.log("pipeline reset", self.log_addr)
    return {
        "node": req["node"],
        "command": Command.RESET
    }
def _check_nodes(node_addrs, output_queue, verbose=False):
    for addr in node_addrs:
        node_client = node.Client(addr)
        try:
            node_status = node_client.send_command_status(retry=False)
        except Exception:
            continue

        if not node_status["running"]:
            if verbose:
                mpl.log("node found at " + pipeline_cluster.util.str_addr(addr))
            output_queue.put(node_client)
        else:
            raise RuntimeError(
                "node at " + pipeline_cluster.util.str_addr(addr)
                + " is not reset. Is there another root running?\n"
                + "The pipeline-cluster only supports one root at a time.")
def _handle_connection(self, conn, caddr):
    while True:
        try:
            req = conn.recv()
            try:
                response = self._handle_request(conn, req)
            except Exception as e:
                mpl.log("exception occurred during request " + str(req) + "\n" + traceback.format_exc(), addr=self.log_addr)
                if self.pipeline is not None:
                    mpl.log("reset pipeline")
                    try:
                        self.pipeline.reset()
                        self.pipeline = None
                    except Exception as e:
                        mpl.log("failed to reset pipeline, you may have to kill the workers manually", addr=self.log_addr)
                conn.send({
                    "command": Command.INTERNAL_ERROR,
                    "describtion": "internal error occurred"
                })
                exit(1)

            conn.send(response)
        except EOFError as e:
            # maybe this should catch all exceptions in case the client disconnects while sending
            break
        except ConnectionResetError as e:
            break

    conn.close()
def search_nodes(self, network="127.0.0.0/24", port=6000, verbose=False):
    with self.scheduler_state_cond:
        if not self.is_reset:
            raise RuntimeError("The pipeline-cluster has to be reset to be able to add more nodes")

        network = ipaddress.ip_network(network, strict=False)
        if verbose:
            mpl.log("scanning network " + str(network) + " for nodes on port " + str(port) + " (" + str(network.num_addresses) + " hosts)")

        n_nodes = len(self.node_clients)
        thrs = []
        node_client_queue = queue.Queue()
        for addrs_chunk in pipeline_cluster.util.chunks(
                [(str(h), port) for h in network if (str(h), port) not in [n.addr for n in self.node_clients]],
                network.num_addresses // mp.cpu_count() + 1):
            thr = threading.Thread(target=Root._check_nodes, args=(addrs_chunk, node_client_queue, verbose))
            thrs.append(thr)
            thr.start()

        for thr in thrs:
            thr.join()

        while True:
            try:
                self.node_clients.append(node_client_queue.get_nowait())
            except queue.Empty:
                break

        if verbose:
            mpl.log("finished scanning network")

        return len(self.node_clients) - n_nodes
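def _example_discover_nodes(root):
    # Usage sketch, assuming `root` is an already constructed and reset Root
    # instance; the network and port below are placeholders.
    n_new = root.search_nodes(network="192.168.1.0/24", port=6000, verbose=True)
    mpl.log("discovered " + str(n_new) + " new node(s)")
    return n_new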
def _handle_command_environment(self, req, conn):
    WORKING_DIR = os.path.expanduser("~/.pipeline-cluster")
    if not os.path.isdir(WORKING_DIR):
        os.makedirs(WORKING_DIR)

    for package in req["local"]:
        util.dict_to_dir(WORKING_DIR, package)
        package_path = os.path.join(WORKING_DIR, list(package)[0])
        package_name = os.path.basename(package_path)
        mpl.log("install local package: " + package_name, addr=self.log_addr)
        subprocess.call(["pip", "install", package_path], shell=False)

    for package_name in req["remote"]:
        subprocess.call(["pip", "install", package_name], shell=False)
        mpl.log("install remote package: " + package_name, addr=self.log_addr)

    mpl.log("finished installing packages, restarting server")
    conn.send({
        "node": req["node"],
        "command": Command.ENVIRONMENT
    })
    os.execv(sys.executable, [sys.executable] + sys.argv)
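# Illustrative sketch only: a possible environment request as this handler consumes
# it. "remote" is a list of pip-installable package names; "local" is a list of
# package directory trees encoded as nested dicts for util.dict_to_dir. The exact
# encoding is defined by dict_to_dir, so the nesting below is an assumption.
_example_environment_request = {
    "node": ("127.0.0.1", 6000),
    "local": [
        {"my_package": {"setup.py": "...", "my_package": {"__init__.py": ""}}},
    ],
    "remote": ["numpy"],
}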
def _worker_routine(taskchain, log_addr, new_items_counter, idle_counter, sleep_counter, terminate_counter, state_cond, benchmark_folder):
    """
    The worker routine for pipeline workers.
    Workers feed items in deepest-first order.
    They passively wait for input if no more items are available.
    When the sleep_counter is set, the workers finish the current task and sleep
    until the counter is unset, even if there are items to process.
    """

    # Workers should only be terminated with the terminate counter from the root
    # pipeline process, thus signals get ignored.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGTERM, signal.SIG_IGN)
    # TODO: ignore/handle more signals if needed

    mpl.set_default_address(log_addr)

    benchmark_file = os.path.join(benchmark_folder, str(os.getpid()))
    benchmark = [{"task": t.name, "processed": 0, "time": 0} for t in taskchain]

    # instantiate class-based tasks in the taskchain
    taskchain = [Task(t.function(*t.args), *t[1:]) if t.is_class else t for t in taskchain]

    while True:
        with open(benchmark_file, "w") as fd:
            json.dump(benchmark, fd)

        with state_cond:
            if terminate_counter.value() == 1:
                exit(0)

            if sleep_counter.value() == 1 or new_items_counter.value() == 0:
                idle_counter.inc()
                state_cond.notify_all()
                while True:
                    state_cond.wait()
                    if terminate_counter.value() == 1:
                        exit(0)
                    if sleep_counter.value() != 1 and new_items_counter.value() != 0:
                        break
                idle_counter.dec()

        for i in reversed(range(len(taskchain))):
            curr_task = taskchain[i]
            curr_benchmark = benchmark[i]
            if curr_task.input_buffer is not None and not curr_task.input_buffer.empty():
                try:
                    item = curr_task.input_buffer.dequeue()
                except:
                    # at this point the buffer recently had at least one item
                    # because there is none currently, another process must have removed one right before
                    # because of the semaphore counter there must be another item in a task before this
                    # thus continue, not break
                    continue

                with state_cond:
                    new_items_counter.dec()

                # handle first item
                start_time = time.time()
                try:
                    item = curr_task.function(item)
                except Exception:
                    mpl.log("Item dropped due to exception:\n" + traceback.format_exc())
                    break
                d_time = time.time() - start_time
                curr_benchmark["time"] += d_time
                curr_benchmark["processed"] += 1

                if item is None:
                    break

                is_last_task = (i == len(taskchain) - 1)
                if curr_task.output_buffer is not None:
                    if curr_task.is_generator:
                        if not item:
                            break
                        curr_task.output_buffer.enqueue(*item)
                        if not is_last_task:
                            with state_cond:
                                new_items_counter.inc(len(item))
                                state_cond.notify_all()
                        else:
                            with state_cond:
                                state_cond.notify_all()
                    else:
                        curr_task.output_buffer.enqueue(item)
                        if not is_last_task:
                            with state_cond:
                                new_items_counter.inc()
                                state_cond.notify_all()
                        else:
                            with state_cond:
                                state_cond.notify_all()
                    break

                # feed forward
                for j in range(i + 1, len(taskchain)):
                    with state_cond:
                        if terminate_counter.value() == 1:
                            exit(0)

                    curr_task = taskchain[j]
                    curr_benchmark = benchmark[j]

                    start_time = time.time()
                    try:
                        item = curr_task.function(item)
                    except Exception:
                        mpl.log("Item dropped due to exception:\n" + traceback.format_exc())
                        break
                    d_time = time.time() - start_time
                    curr_benchmark["time"] += d_time
                    curr_benchmark["processed"] += 1

                    if item is None:
                        break

                    is_last_task = (j == len(taskchain) - 1)
                    if curr_task.output_buffer is not None:
                        if curr_task.is_generator:
                            if not item:
                                break
                            curr_task.output_buffer.enqueue(*item)
                            if not is_last_task:
                                with state_cond:
                                    new_items_counter.inc(len(item))
                                    state_cond.notify_all()
                            else:
                                with state_cond:
                                    state_cond.notify_all()
                        else:
                            curr_task.output_buffer.enqueue(item)
                            if not is_last_task:
                                with state_cond:
                                    new_items_counter.inc()
                                    state_cond.notify_all()
                            else:
                                with state_cond:
                                    state_cond.notify_all()
                        break

                break
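# Sketch of the two task flavours the worker loop above distinguishes (names are
# illustrative only). A plain task returns a single item, or None to drop it; a
# task flagged as is_generator returns an iterable whose elements are all enqueued
# into the next buffer (an empty or falsy return drops the item).
def _example_plain_task(item):
    # transform a single item and pass it on, or drop it by returning None
    return item

def _example_fanout_task(item):
    # produce several downstream items from one input item
    return [item, item]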
def _worker_signal_handler(signum, frame):
    print(signum)
    mpl.log("worker terminated")
    exit(1)
def __call__(self, item):
    mpl.log(self.message)
    return item
def dummy(item):
    mpl.log("dummy message")
    return item