Example 1
    def _handle_command_setup(self, req):
        if self.pipeline is not None and not self.pipeline.is_reset():
            return {
                "node": req["node"],
                "command": Command.ERROR,
                "description": "previous pipeline is still running"
            }

        taskchain = [(t["function"], t.get("args", ()), t["is_generator"]) for t in req["tasks"]]
        try:
            self.pipeline = pipeline.Pipeline(
                self.log_addr, name=req["name"], version=req["version"],
                taskchain=taskchain, benchmark_folder=self.benchmark_folder)
        except AssertionError:
            self.pipeline = None
            return {
                "node": req["node"],
                "command": Command.ERROR,
                "description": traceback.format_exc()
            }

        n_workers = req["n_workers"]
        self.pipeline.boot(n_workers=n_workers if n_workers is not None else mp.cpu_count())
        
        mpl.log("pipeline setup: " + self.pipeline.get_name() + " v" + str(self.pipeline.get_version()))
        return {
            "node": req["node"],
            "command": Command.SETUP
        }
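
For reference, a minimal sketch of the SETUP request this handler expects, with field names taken from the code above; the concrete values and the encoding of "function" (callable vs. import path) are assumptions:

setup_req = {                              # hypothetical payload
    "node": ("127.0.0.1", 6000),           # echoed back in the response
    "name": "example-pipeline",
    "version": 1,
    "n_workers": None,                     # None falls back to mp.cpu_count()
    "tasks": [
        {"function": lambda item: item, "is_generator": False},               # "args" defaults to ()
        {"function": lambda item: item.split(), "args": (), "is_generator": True},
    ],
}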
Example 2
    def serve(self):
        signal.signal(signal.SIGINT, self._signal_handler)
        signal.signal(signal.SIGTERM, self._signal_handler)
        self.start_time = time.time()
        mpl.log("node server started at " + self.addr[0] + ":" + str(self.addr[1]), self.log_addr)
        with mpc.Listener(self.addr, "AF_INET", self.conn_buffer_size, None) as lst:
            while True:
                conn = lst.accept()
                threading.Thread(target=self._handle_connection, args=(conn, lst.last_accepted), daemon=True).start()
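
The listener speaks the standard multiprocessing.connection protocol, so a client round-trip can be sketched with the standard library alone; the address and the payload below are illustrative, not taken from the source:

from multiprocessing.connection import Client

# connect to a running node server and perform one request/response cycle
with Client(("127.0.0.1", 6000), "AF_INET") as conn:
    conn.send({"node": ("127.0.0.1", 6000), "command": "STATUS"})  # placeholder command value
    print(conn.recv())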
Example 3
    def _handle_command_reset(self, req):
        if self.pipeline is not None:
            self.pipeline.reset()
            self.pipeline = None
            mpl.log("pipeline reset", self.log_addr)

        return {
            "node": req["node"],
            "command": Command.RESET
        }
Example 4
    @staticmethod
    def _check_nodes(node_addrs, output_queue, verbose=False):
        for addr in node_addrs:
            node_client = node.Client(addr)
            try:
                node_status = node_client.send_command_status(retry=False)
            except Exception:
                # host unreachable or no node listening at this address
                continue

            if not node_status["running"]:
                if verbose:
                    mpl.log("node found at " +
                            pipeline_cluster.util.str_addr(addr))
                output_queue.put(node_client)
            else:
                raise RuntimeError(
                    "node at " + pipeline_cluster.util.str_addr(addr) +
                    " is not reset. Is there another root running?\nThe pipeline-cluster only supports one root at a time."
                )
Example 5
    def _handle_connection(self, conn, caddr):
        while True:
            try:
                req = conn.recv()
                try:
                    response = self._handle_request(conn, req)
                except Exception:
                    mpl.log("exception occurred during request " + str(req) + "\n" + traceback.format_exc(), addr=self.log_addr)
                    if self.pipeline is not None:
                        mpl.log("resetting pipeline", addr=self.log_addr)
                        try:
                            self.pipeline.reset()
                            self.pipeline = None
                        except Exception:
                            mpl.log("failed to reset pipeline, maybe you have to kill the workers manually", addr=self.log_addr)

                    conn.send({
                        "command": Command.INTERNAL_ERROR,
                        "description": "internal error occurred"
                    })

                    exit(1)

                conn.send(response)
            except (EOFError, ConnectionResetError): # maybe this should catch all exceptions in case the client disconnects while sending
                break
        
        conn.close()
Example 6
    def search_nodes(self, network="127.0.0.0/24", port=6000, verbose=False):
        with self.scheduler_state_cond:
            if not self.is_reset:
                raise RuntimeError(
                    "The pipeline-cluster has to be reset to be able to add more nodes"
                )

        network = ipaddress.ip_network(network, strict=False)
        if verbose:
            mpl.log("scanning network " + str(network) +
                    " for nodes on port " + str(port) + " (" +
                    str(network.num_addresses) + " hosts)")

        n_nodes = len(self.node_clients)
        thrs = []
        node_client_queue = queue.Queue()
        known_addrs = [n.addr for n in self.node_clients]
        candidate_addrs = [(str(h), port) for h in network if (str(h), port) not in known_addrs]
        chunk_size = network.num_addresses // mp.cpu_count() + 1
        for addrs_chunk in pipeline_cluster.util.chunks(candidate_addrs, chunk_size):
            thr = threading.Thread(target=Root._check_nodes, args=(addrs_chunk, node_client_queue, verbose))
            thrs.append(thr)
            thr.start()

        for thr in thrs:
            thr.join()

        while True:
            try:
                self.node_clients.append(node_client_queue.get_nowait())
            except queue.Empty:
                break

        if verbose:
            mpl.log("finished scanning network")

        return len(self.node_clients) - n_nodes
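
A usage sketch, assuming an already constructed Root instance; the constructor arguments are omitted and the network is illustrative:

root = Root(...)  # constructor arguments omitted
n_new = root.search_nodes(network="192.168.1.0/24", port=6000, verbose=True)
print(str(n_new) + " new node(s) registered")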
Example 7
    def _handle_command_environment(self, req, conn):
        WORKING_DIR = os.path.expanduser("~/.pipeline-cluster")

        if not os.path.isdir(WORKING_DIR):
            os.makedirs(WORKING_DIR)

        for package in req["local"]:
            util.dict_to_dir(WORKING_DIR, package)
            package_path = os.path.join(WORKING_DIR, list(package)[0])
            package_name = os.path.basename(package_path)
            mpl.log("install local package: " + package_name, addr=self.log_addr)
            subprocess.call(["pip", "install", package_path], shell=False)

        for package_name in req["remote"]:
            subprocess.call(["pip", "install", package_name], shell=False)
            mpl.log("install remote package: " + package_name, addr=self.log_addr)


        mpl.log("finished installing packages, restart server")
        conn.send({
            "node": req["node"],
            "command": Command.ENVIRONMENT
        })
        os.execv(sys.executable, [sys.executable] + sys.argv)
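
A sketch of the ENVIRONMENT request consumed above; that "local" entries are nested dicts unpacked by util.dict_to_dir (directory name mapping to file contents) is an assumption inferred from the loop, not confirmed by the source:

env_req = {                        # hypothetical payload
    "node": ("127.0.0.1", 6000),
    "local": [
        # assumed dict_to_dir layout: one top-level key per package directory
        {"mypackage": {"setup.py": "...", "mypackage": {"__init__.py": ""}}},
    ],
    "remote": ["requests"],        # plain pip package names
}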
Example 8
def _worker_routine(taskchain, log_addr, new_items_counter, idle_counter,
                    sleep_counter, terminate_counter, state_cond,
                    benchmark_folder):
    """
    The worker routine for pipeline workers.
    Workers feed items in deepest first order.
    They passively wait for input if no more items are available.
    When the sleep_counter is set, the workers finish the current task and sleep until the counter is unset,
    even if there are items to process.
    """

    # workers should only be terminated via the terminate counter from the root pipeline process,
    # so signals are ignored here
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGTERM, signal.SIG_IGN)
    # TODO: ignore/handle more signals if needed

    mpl.set_default_address(log_addr)
    benchmark_file = os.path.join(benchmark_folder, str(os.getpid()))
    benchmark = [{
        "task": t.name,
        "processed": 0,
        "time": 0
    } for t in taskchain]

    # instantiate class-based tasks; the remaining Task fields stay unchanged
    taskchain = [
        Task(t.function(*t.args), *t[1:]) if t.is_class else t
        for t in taskchain
    ]

    while True:
        with open(benchmark_file, "w") as fd:
            json.dump(benchmark, fd)

        with state_cond:
            if terminate_counter.value() == 1:
                exit(0)

            if sleep_counter.value() == 1 or new_items_counter.value() == 0:
                idle_counter.inc()
                state_cond.notify_all()

                while True:
                    state_cond.wait()
                    if terminate_counter.value() == 1:
                        exit(0)

                    if sleep_counter.value() != 1 and new_items_counter.value() != 0:
                        break

                idle_counter.dec()

        for i in reversed(range(len(taskchain))):
            curr_task = taskchain[i]
            curr_benchmark = benchmark[i]
            if curr_task.input_buffer is not None and not curr_task.input_buffer.empty():
                try:
                    item = curr_task.input_buffer.dequeue()
                except Exception:
                    # the buffer recently held at least one item; if it is empty now,
                    # another worker dequeued it right before this one.
                    # the semaphore counter guarantees there is an item in an earlier task,
                    # thus continue, not break
                    continue

                with state_cond:
                    new_items_counter.dec()

                # handle first item
                start_time = time.time()
                try:
                    item = curr_task.function(item)
                except Exception:
                    mpl.log("Item dropped due to exception:\n" +
                            traceback.format_exc())
                    break
                d_time = time.time() - start_time
                curr_benchmark["time"] += d_time
                curr_benchmark["processed"] += 1

                if item is None:
                    break

                is_last_task = (i == len(taskchain) - 1)
                if curr_task.output_buffer is not None:
                    if curr_task.is_generator:
                        if not item:
                            break
                        curr_task.output_buffer.enqueue(*item)
                        if not is_last_task:
                            with state_cond:
                                new_items_counter.inc(len(item))
                                state_cond.notify_all()
                        else:
                            with state_cond:
                                state_cond.notify_all()
                    else:
                        curr_task.output_buffer.enqueue(item)
                        if not is_last_task:
                            with state_cond:
                                new_items_counter.inc()
                                state_cond.notify_all()
                        else:
                            with state_cond:
                                state_cond.notify_all()
                    break

                # feed forward
                for j in range(i + 1, len(taskchain)):
                    with state_cond:
                        if terminate_counter.value() == 1:
                            exit(0)

                    curr_task = taskchain[j]
                    curr_benchmark = benchmark[j]

                    start_time = time.time()
                    try:
                        item = curr_task.function(item)
                    except Exception:
                        mpl.log("Item dropped due to exception:\n" +
                                traceback.format_exc())
                        break
                    d_time = time.time() - start_time
                    curr_benchmark["time"] += d_time
                    curr_benchmark["processed"] += 1

                    if item is None:
                        break
                    is_last_task = (j == len(taskchain) - 1)
                    if curr_task.output_buffer is not None:
                        if curr_task.is_generator:
                            if not item:
                                break
                            curr_task.output_buffer.enqueue(*item)
                            if not is_last_task:
                                with state_cond:
                                    new_items_counter.inc(len(item))
                                    state_cond.notify_all()
                            else:
                                with state_cond:
                                    state_cond.notify_all()
                        else:
                            curr_task.output_buffer.enqueue(item)
                            if not is_last_task:
                                with state_cond:
                                    new_items_counter.inc()
                                    state_cond.notify_all()
                            else:
                                with state_cond:
                                    state_cond.notify_all()
                        break

                break
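
Note that the generator branches above call output_buffer.enqueue(*item) and count len(item), so a "generator" task is expected to return a whole sequence of items rather than yield them, and a falsy result ends the traversal. A minimal sketch of such a task (name and logic hypothetical):

def split_lines(item):
    # returns a list of items to fan out; an empty list means nothing is forwarded
    return [line for line in item.splitlines() if line]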
Example 9
def _worker_signal_handler(signum, frame):
    mpl.log("worker terminated by signal " + str(signum))
    exit(1)
Example 10
    def __call__(self, item):
        mpl.log(self.message)
        return item
Example 11
def dummy(item):
    mpl.log("dummy message")
    return item