Example #1
def run(graph_filename=None, start_port=9000):
    """Instantiates and runs the dataflow graph.

    ERDOS will spawn one process for each Python operator and connect them
    via TCP.

    Args:
        graph_filename (str): the filename to which to write the dataflow graph
            as a DOT file.
        start_port (int): the port on which to start. The start port is the
            lowest port ERDOS will use to establish TCP connections between
            operators.
    """
    data_addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)
    ]
    control_addresses = [
        "127.0.0.1:{port}".format(port=start_port + len(data_addresses) + i)
        for i in range(_num_py_operators + 1)
    ]

    def runner(node_id, data_addresses, control_addresses):
        _internal.run(node_id, data_addresses, control_addresses)

    processes = [
        mp.Process(target=runner, args=(i, data_addresses, control_addresses))
        for i in range(1, _num_py_operators + 1)
    ]

    for p in processes:
        p.start()

    # Needed to shut down child processes
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    # The driver must always be on node 0; otherwise, ingest and extract
    # streams will break.
    _internal.run_async(0, data_addresses, control_addresses, graph_filename)

    for p in processes:
        p.join()
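
A minimal driver sketch for this blocking entry point follows. The erdos.connect(...)/erdos.OperatorConfig() call that would actually build the graph is only indicated in a comment, since that part of the API is not shown in this excerpt; treat the snippet as an illustration rather than a verbatim recipe.

import erdos


def main():
    # Declare operators and streams here before starting the graph, e.g.:
    # (stream, ) = erdos.connect(MyOperator, erdos.OperatorConfig(), [])

    # Blocks until the dataflow graph shuts down (or until Ctrl-C triggers
    # the SIGINT handler that run() installs).
    erdos.run(graph_filename="graph.dot", start_port=9000)


if __name__ == "__main__":
    main()
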
Example #2
def run_async(graph_filename: Optional[str] = None,
              start_port: Optional[int] = 9000) -> NodeHandle:
    """Instantiates and runs the dataflow graph asynchronously.

    ERDOS will spawn one process for each Python operator and connect them
    via TCP.

    Args:
        graph_filename: The filename to which to write the dataflow graph
            as a DOT file.
        start_port: The port on which to start. The start port is the
            lowest port ERDOS will use to establish TCP connections between
            operators.

    Returns:
        A :py:class:`.NodeHandle` that allows the driver to interface with the
        dataflow graph.
    """
    data_addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)
    ]
    control_addresses = [
        "127.0.0.1:{port}".format(port=start_port + len(data_addresses) + i)
        for i in range(_num_py_operators + 1)
    ]
    logger.debug(
        "Running the dataflow graph on addresses: {}".format(data_addresses))

    # Fix for macOS, where multiprocessing defaults
    # to spawn() instead of fork() in Python 3.8+.
    # https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
    # Warning: may lead to crashes
    # https://bugs.python.org/issue33725
    ctx = mp.get_context("fork")
    processes = [
        ctx.Process(target=_run_node,
                    args=(i, data_addresses, control_addresses))
        for i in range(1, _num_py_operators + 1)
    ]

    # Needed to shut down child processes
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    for p in processes:
        p.start()

    # The driver must always be on node 0; otherwise, ingest and extract
    # streams will break.
    py_node_handle = _internal.run_async(0, data_addresses, control_addresses,
                                         graph_filename)

    return NodeHandle(py_node_handle, processes)
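
run_async reserves one data address and one control address per process, driver included. Purely for illustration, the snippet below reproduces that layout for three operators; the literal 3 stands in for the module-level _num_py_operators.

# Illustrative only: rebuild the address layout used above, assuming
# three Python operators and the default start_port of 9000.
num_py_operators = 3  # stand-in for the module-level _num_py_operators
start_port = 9000

data_addresses = [
    "127.0.0.1:{}".format(start_port + i)
    for i in range(num_py_operators + 1)  # +1 for the driver on node 0
]
control_addresses = [
    "127.0.0.1:{}".format(start_port + len(data_addresses) + i)
    for i in range(num_py_operators + 1)
]

print(data_addresses)     # ports 9000-9003
print(control_addresses)  # ports 9004-9007
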
Example #3
def run_async(driver, start_port=9000):
    """Instantiates and runs the dataflow graph asynchronously.

    ERDOS will spawn one process for each Python operator and connect them
    via TCP.

    Args:
        driver (function): function that builds the dataflow graph. This must
            be passed as a function so it can run on all ERDOS processes.
        start_port (int): the port on which to start. The start port is the
            lowest port ERDOS will use to establish TCP connections between
            operators.
    """
    results = driver()  # run driver to set _num_py_operators

    addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)  # Add 1 for the driver
    ]

    def runner(driver, node_id, addresses):
        driver()
        _internal.run(node_id, addresses)

    processes = [
        mp.Process(target=runner,
                   args=(driver, i + 1,
                         addresses))  # Add 1 b/c driver is node 0
        for i in range(_num_py_operators)
    ]

    _internal.run_async(0, addresses)

    for p in processes:
        p.start()

    # Needed to shut down child processes
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    return results
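
This older variant rebuilds the graph inside every node process, so the driver must be a plain top-level callable. A hypothetical usage sketch follows; the erdos.connect(...) call inside build_graph is an assumption about the surrounding API and is shown only as a comment.

import erdos


def build_graph():
    # Runs once in the parent (to count operators and produce the return
    # value) and once more in every spawned node process, so it must be a
    # top-level function with no hidden state.
    # e.g. (stream, ) = erdos.connect(MyOperator, erdos.OperatorConfig(), [])
    return None  # whatever the driver returns is handed back by run_async


results = erdos.run_async(build_graph, start_port=9000)
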
Example #4
def run_async(graph_filename: Optional[str] = None,
              start_port: Optional[int] = 9000) -> NodeHandle:
    """Instantiates and runs the dataflow graph asynchronously.

    ERDOS will spawn one process for each Python operator and connect them
    via TCP.

    Args:
        graph_filename: The filename to which to write the dataflow graph
            as a DOT file.
        start_port: The port on which to start. The start port is the
            lowest port ERDOS will use to establish TCP connections between
            operators.

    Returns:
        A :py:class:`.NodeHandle` that allows the driver to interface with the
        dataflow graph.
    """
    data_addresses = [
        "127.0.0.1:{port}".format(port=start_port + i)
        for i in range(_num_py_operators + 1)
    ]
    control_addresses = [
        "127.0.0.1:{port}".format(port=start_port + len(data_addresses) + i)
        for i in range(_num_py_operators + 1)
    ]
    logger.debug(
        "Running the dataflow graph on addresses: {}".format(data_addresses))

    def runner(node_id, data_addresses, control_addresses):
        _internal.run(node_id, data_addresses, control_addresses)

    processes = [
        mp.Process(target=runner, args=(i, data_addresses, control_addresses))
        for i in range(1, _num_py_operators + 1)
    ]

    # Needed to shut down child processes
    def sigint_handler(sig, frame):
        for p in processes:
            p.terminate()
        sys.exit(0)

    signal.signal(signal.SIGINT, sigint_handler)

    for p in processes:
        p.start()

    # The driver must always be on node 0; otherwise, ingest and extract
    # streams will break.
    py_node_handle = _internal.run_async(0, data_addresses, control_addresses,
                                         graph_filename)

    return NodeHandle(py_node_handle, processes)
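
The node-0 comment above matters because ingest and extract streams live in the driver process. The sketch below illustrates that pattern under the assumption that the surrounding erdos API provides IngestStream, ExtractStream, Message, Timestamp, and callback-based operators; DoubleOp is a placeholder operator, not part of the excerpt above.

import erdos


class DoubleOp(erdos.Operator):
    """Placeholder operator that doubles every value it receives."""

    def __init__(self, read_stream, write_stream):
        # Assumed callback API: invoke self.callback with each received
        # message and the declared write stream.
        read_stream.add_callback(self.callback, [write_stream])

    @staticmethod
    def connect(read_stream):
        return [erdos.WriteStream()]

    def callback(self, msg, write_stream):
        write_stream.send(erdos.Message(msg.timestamp, msg.data * 2))


def main():
    ingest_stream = erdos.IngestStream()
    (output_stream, ) = erdos.connect(DoubleOp, erdos.OperatorConfig(),
                                      [ingest_stream])
    extract_stream = erdos.ExtractStream(output_stream)

    # The driver stays on node 0, as required for ingest and extract streams;
    # the returned NodeHandle can be kept around to manage the running graph.
    handle = erdos.run_async()

    # Push one message into the graph and read the operator's output back.
    ingest_stream.send(erdos.Message(erdos.Timestamp(coordinates=[0]), 21))
    print(extract_stream.read().data)  # expected: 42


if __name__ == "__main__":
    main()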