def run_multiple_experiments_with_slurm(experiments, n_parallel=None, max_processes_per_node=None, raise_exceptions=True, run_args={}, slurm_kwargs={}):
    '''
    Run multiple experiments using slurm, optionally in parallel.
    '''
    if n_parallel and n_parallel > 1:
        # raise NotImplementedError("No parallel Slurm execution at the moment. Implement it!")
        print('Warning... parallel-slurm integration is very beta. Use with caution')
        experiment_subsets = divide_into_subsets(experiments, subset_size=n_parallel)
        for i, exp_subset in enumerate(experiment_subsets):
            nanny = Nanny()
            function_call = partial(run_multiple_experiments,
                experiments=exp_subset,
                parallel=n_parallel if max_processes_per_node is None else max_processes_per_node,
                display_results=False,
                run_args=run_args
                )
            spp = SlurmPythonProcess(name="Group %i"%i, function=function_call,ip_address="127.0.0.1", slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp,monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
    else:
        for i,exp in enumerate(experiments):
            nanny = Nanny()
            function_call = partial(run_experiment, experiment=exp, slurm_job=True, experiment_path=get_experiment_dir(),
                raise_exceptions=raise_exceptions,display_results=False, **run_args)
            spp = SlurmPythonProcess(name="Exp %i"%i, function=function_call,ip_address="127.0.0.1", slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp,monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
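A hedged usage sketch for the function above. Only the keyword arguments come from the signature shown; the import path, the get_experiments() helper, and the slurm_kwargs keys are illustrative assumptions.

# Hedged usage sketch -- the module path, get_experiments(), and the slurm_kwargs
# keys are assumptions; only the call signature is taken from the function above.
from my_project.slurm_runner import run_multiple_experiments_with_slurm, get_experiments  # hypothetical imports

experiments = get_experiments()                 # hypothetical helper returning experiment objects
run_multiple_experiments_with_slurm(
    experiments,
    n_parallel=4,                               # group 4 experiments per Slurm job
    max_processes_per_node=2,                   # cap parallel processes inside each node
    raise_exceptions=False,
    slurm_kwargs={'partition': 'gpu'},          # assumed key; passed through to SlurmPythonProcess
)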
Example #2
def test_simple_process():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s"%(get_test_functions_path(ip_address),"success_function")
        pyc = PythonChildProcess(name="Process", ip_address=ip_address,command=command)
        nanny.register_child_process(pyc,)
        with captured_output() as (out,err):
            nanny.execute_all_child_processes()
        assert "%s: Success"%(pyc.get_name()) == out.getvalue().strip()
Example #3
def test_several_simple_processes(N):
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s"%(get_test_functions_path(ip_address),"success_function")
        for i in range(N):
            pyc = PythonChildProcess(name="Process%i"%i, ip_address=ip_address,command=command)
            nanny.register_child_process(pyc,)
        with captured_output() as (out,err):
            nanny.execute_all_child_processes()
        out_value = out.getvalue().strip()
        for pyc in nanny.managed_child_processes.values():
            assert "%s: Success"%(pyc.get_name()) in out_value
Example #4
def test_iter_print():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = ["python","-u", get_test_functions_path(ip_address), "--callback=iter_print"]
        pyc = PythonChildProcess(name="P1",ip_address=ip_address,command=command)
        nanny.register_child_process(pyc)
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        if pyc.is_local():
            assert str(out.getvalue().strip()) == "\n".join(["P1: %i"%i for i in [0,2,4,6,8]])
            assert str(err.getvalue().strip()) == "\n".join(["P1: %i"%i for i in [1,3,5,7,9]])
        else:
            assert "\r\n".join(["P1: %i" % i for i in range(10)]) == str(out.getvalue().strip())
Example #5
def test_process_termination():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s"%(get_test_functions_path(ip_address),"count_low")
        pyc = PythonChildProcess(name="Process1", ip_address=ip_address,command=command)
        nanny.register_child_process(pyc,)
        command = "python %s --callback=%s"%(get_test_functions_path(ip_address),"count_high")
        pyc = PythonChildProcess(name="Process2", ip_address=ip_address,command=command)
        nanny.register_child_process(pyc,)
        with captured_output() as (out,err):
            nanny.execute_all_child_processes(time_out=1)
        check_text = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now."%ip_address
        assert check_text in out.getvalue() or check_text in err.getvalue()
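The test implies that count_low finishes quickly while count_high keeps printing past the 1-second time_out, which is what triggers the termination message. A hedged sketch of two such callbacks; the exact counts and delays are assumptions:

# Hedged sketch of the two test callbacks; the exact counts and delays are assumptions.
import time

def count_low():
    for i in range(3):           # finishes almost immediately
        print(i)

def count_high():
    for i in range(100000):      # keeps printing well past the 1-second time_out
        print(i)
        time.sleep(0.01)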
Example #6
def test_output_monitor():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s"%(get_test_functions_path(ip_address),"short_sleep")
        pyc = PythonChildProcess(name="Process1", ip_address=ip_address,command=command)
        nanny.register_child_process(pyc,monitor_if_stuck_timeout=5)
        command = "python %s --callback=%s"%(get_test_functions_path(ip_address),"count_high")
        pyc = PythonChildProcess(name="Process2", ip_address=ip_address,command=command)
        nanny.register_child_process(pyc,monitor_if_stuck_timeout=3)
        with captured_output() as (out,err):
            nanny.execute_all_child_processes(time_out=1)
        check_text1 = "Timeout occurred after 0.1 min, process Process1 stuck"
        check_text = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now."%ip_address
        assert check_text in out.getvalue() or check_text in err.getvalue()
        assert check_text1 in out.getvalue() or check_text1 in err.getvalue()
Example #7
def test_process_termination():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s" % (
            get_test_functions_path(ip_address), "count_low")
        pyc = PythonChildProcess(name="Process1",
                                 ip_address=ip_address,
                                 command=command)
        nanny.register_child_process(pyc, )
        command = "python %s --callback=%s" % (
            get_test_functions_path(ip_address), "count_high")
        pyc = PythonChildProcess(name="Process2",
                                 ip_address=ip_address,
                                 command=command)
        nanny.register_child_process(pyc, )
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        check_text = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now." % ip_address
        assert check_text in out.getvalue() or check_text in err.getvalue()
Example #8
def test_simple_process():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s" % (
            get_test_functions_path(ip_address), "success_function")
        pyc = PythonChildProcess(name="Process",
                                 ip_address=ip_address,
                                 command=command)
        nanny.register_child_process(pyc, )
        with captured_output() as (out, err):
            nanny.execute_all_child_processes()
        assert "%s: Success" % (pyc.get_name()) == out.getvalue().strip()
Example #9
def test_output_monitor():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s" % (
            get_test_functions_path(ip_address), "short_sleep")
        pyc = PythonChildProcess(name="Process1",
                                 ip_address=ip_address,
                                 command=command)
        nanny.register_child_process(pyc, monitor_if_stuck_timeout=5)
        command = "python %s --callback=%s" % (
            get_test_functions_path(ip_address), "count_high")
        pyc = PythonChildProcess(name="Process2",
                                 ip_address=ip_address,
                                 command=command)
        nanny.register_child_process(pyc, monitor_if_stuck_timeout=3)
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        check_text1 = "Timeout occurred after 0.1 min, process Process1 stuck"
        check_text = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now." % ip_address
        assert check_text in out.getvalue() or check_text in err.getvalue()
        assert check_text1 in out.getvalue() or check_text1 in err.getvalue()
Example #10
def test_several_simple_processes(N):
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s" % (
            get_test_functions_path(ip_address), "success_function")
        for i in range(N):
            pyc = PythonChildProcess(name="Process%i" % i,
                                     ip_address=ip_address,
                                     command=command)
            nanny.register_child_process(pyc, )
        with captured_output() as (out, err):
            nanny.execute_all_child_processes()
        out_value = out.getvalue().strip()
        for pyc in nanny.managed_child_processes.values():
            assert "%s: Success" % (pyc.get_name()) in out_value
Example #11
def run_multiple_experiments_with_slurm(experiments, n_parallel=None, raise_exceptions=True, run_args={}, slurm_kwargs={}):
    '''
    Run multiple experiments using slurm, optionally in parallel.
    '''
    if n_parallel and n_parallel > 1:
        raise NotImplementedError("No parallel Slurm execution at the moment. Implement it!")
    else:
        for i,exp in enumerate(experiments):
            nanny = Nanny()
            func = run_experiment
            experiment_path = get_experiment_dir()
            function_call = partial(func, experiment=exp, slurm_job=True, experiment_path=experiment_path,raise_exceptions=raise_exceptions,display_results=False, **run_args)
            spp = SlurmPythonProcess(name="Exp %i"%i, function=function_call,ip_address="127.0.0.1", slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp,monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
Example #12
def test_iter_print():
    for ip_address in ip_addresses:
        nanny = Nanny()
        command = [
            "python", "-u",
            get_test_functions_path(ip_address), "--callback=iter_print"
        ]
        pyc = PythonChildProcess(name="P1",
                                 ip_address=ip_address,
                                 command=command)
        nanny.register_child_process(pyc)
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        if pyc.is_local():
            assert str(out.getvalue().strip()) == "\n".join(
                ["P1: %i" % i for i in [0, 2, 4, 6, 8]])
            assert str(err.getvalue().strip()) == "\n".join(
                ["P1: %i" % i for i in [1, 3, 5, 7, 9]])
        else:
            assert "\r\n".join(["P1: %i" % i for i in range(10)
                                ]) == str(out.getvalue().strip())
Example #13
def run_multiple_experiments_with_slurm(experiments,
                                        n_parallel=None,
                                        max_processes_per_node=None,
                                        raise_exceptions=True,
                                        run_args={},
                                        slurm_kwargs={}):
    '''
    Run multiple experiments using slurm, optionally in parallel.
    '''
    if n_parallel and n_parallel > 1:
        # raise NotImplementedError("No parallel Slurm execution at the moment. Implement it!")
        print(
            'Warning... parallel-slurm integration is very beta. Use with caution'
        )
        experiment_subsets = divide_into_subsets(experiments,
                                                 subset_size=n_parallel)
        for i, exp_subset in enumerate(experiment_subsets):
            nanny = Nanny()
            function_call = partial(
                run_multiple_experiments,
                experiments=exp_subset,
                parallel=n_parallel
                if max_processes_per_node is None else max_processes_per_node,
                display_results=False,
                run_args=run_args)
            spp = SlurmPythonProcess(name="Group %i" % i,
                                     function=function_call,
                                     ip_address="127.0.0.1",
                                     slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
    else:
        for i, exp in enumerate(experiments):
            nanny = Nanny()
            function_call = partial(run_experiment,
                                    experiment=exp,
                                    slurm_job=True,
                                    experiment_path=get_experiment_dir(),
                                    raise_exceptions=raise_exceptions,
                                    display_results=False,
                                    **run_args)
            spp = SlurmPythonProcess(name="Exp %i" % i,
                                     function=function_call,
                                     ip_address="127.0.0.1",
                                     slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
Example #14
def set_up_plotting_server():
    """
    Sets up the plotting server.
    """

    print("Setting up Plotting Server")

    # First we generate the system call that starts the server
    # TODO: This assumes the same installation path relative to the home-dir on the local machine as on the remote machine
    file_to_execute = os.path.join(os.path.dirname(__file__), 'plotting_server.py')
    file_to_execute = file_to_execute.replace(os.path.expanduser("~"),"~",1)
    plotting_server_address = get_plotting_server_address()
    if plotting_server_address == "":
        plotting_server_address = "127.0.0.1"
    if plotting_server_address in get_local_ips():
        command = ["python", "-u", file_to_execute]
    else:
        check_config_file(plotting_server_address) # Make sure all things are set
        check_ssh_connection(plotting_server_address) # Make sure the SSH-connection works
        command =["export DISPLAY=:0.0;", "python","-u", file_to_execute]
        # TODO: Setting DISPLAY to :0.0 is a heuristic at the moment. I don't understand yet how these DISPLAY variables are set.

    # With the command set up, we can instantiate a child process and start it. Also we want to forward stdout and stderr from the remote process asynchronously.
    global _nanny
    _nanny = Nanny()
    cp = PythonChildProcess(ip_address=plotting_server_address, command=command, name="Plotting_Server",set_up_port_for_structured_back_communication=True)
    _nanny.register_child_process(cp,monitor_for_termination=False,monitor_if_stuck_timeout=None,)
    _nanny.execute_all_child_processes(blocking=False)
    back_comm_queue = cp.get_queue_from_cp()
    try:
        gettrace = getattr(sys, 'gettrace', None)
        timeout = None if (gettrace is not None and gettrace()) else 10  # wait indefinitely when running under a debugger
        server_message = back_comm_queue.get(block=True,timeout=timeout)
    except Queue.Empty:
        print("The Plotting Server did not respond for 10 seconds. It probably crashed")
        sys.exit(1)

    try:
        port = int(server_message.dbplot_message)
    except ValueError:
        print("There was an incorrect string on the remote server's stdout. Make sure the server first communicates a port number. Received:\n {}".format(str_port))
        sys.exit(0)

    # In the remote setting we don't want to rely on the user correctly specifying their firewalls. Therefore we need to set up port forwarding through ssh:
    # Also, we have the ssh session open already, so why not reuse it.
    if plotting_server_address not in get_local_ips():
        ssh_conn = cp.get_ssh_connection()
        # Needs to be in a thread since the call blocks forever.
        # Todo: this ssh tunnel is opened system-wide. That means that any subsequent attempts to open the ssh-tunnel (by another dbplot-using process, for example)
        # will behave weirdly. As far as I have tested, nothing happens and the port forwarding works just fine in the second process. However, when one of the two
        # processes terminates, the ssh-tunnel is closed for the other process as well.
        t3 = threading.Thread(target = forward_tunnel, kwargs={"local_port":port, "remote_host":plotting_server_address, "remote_port":port,"ssh_conn":ssh_conn})
        t3.setDaemon(True)
        t3.start()

    # Now attempt to connect to the plotting server
    server_address = ("localhost", port)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect(tuple(server_address))
    except:
        raise

    # Once connected, set up the asynchronous threads that forward every dbplot call to the server. We do this asynchronously for two reasons:
    # 1.) The queues involved can be shared between different threads (not processes), so concurrent dbplot calls are all forwarded correctly.
    # 2.) Sending a plot to the server returns immediately, independent of any socket-related communication delays that might exist.
    # (There shouldn't be any, but, you know, principle.)
    global _to_subprocess_queue
    global _id_queue
    _to_subprocess_queue = Queue.Queue()
    _id_queue = Queue.Queue()
    t1 = threading.Thread(target=push_to_server, args=(_to_subprocess_queue, sock))
    t1.setDaemon(True)
    t1.start()
    # if blocking:
    t2 = threading.Thread(target=collect_from_server, args=(_id_queue, sock))
    t2.setDaemon(True)
    t2.start()
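push_to_server and collect_from_server are not shown in this example. The sending side presumably follows the standard queue-to-socket daemon-thread pattern; the sketch below illustrates that pattern under that assumption and is not the actual implementation.

def push_to_server_sketch(msg_queue, sock):
    # msg_queue: a queue.Queue of bytes; sock: an already-connected socket.socket.
    # Drain the queue and write each message to the socket, so callers that
    # enqueue plot messages return immediately.
    while True:
        msg = msg_queue.get()        # blocks until a dbplot message is enqueued
        if msg is None:              # sentinel used to shut the forwarder down
            break
        sock.sendall(msg)            # hand the raw bytes to the plotting server

# Started the same way as in set_up_plotting_server, e.g.:
#   t1 = threading.Thread(target=push_to_server_sketch, args=(_to_subprocess_queue, sock))
#   t1.setDaemon(True); t1.start()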
Example #15
def set_up_plotting_server():
    """
    Sets up the plotting server.
    """

    print("Setting up Plotting Server")

    # First we generate the system call that starts the server
    # TODO: This assumes the same installation path relative to the home-dir on the local machine as on the remote machine
    file_to_execute = os.path.join(os.path.dirname(__file__), 'plotting_server.py')
    file_to_execute = file_to_execute.replace(os.path.expanduser("~"),"~",1)
    plotting_server_address = get_plotting_server_address()
    if plotting_server_address == "":
        plotting_server_address = "127.0.0.1"
    if plotting_server_address in get_local_ips():
        command = ["python", "-u", file_to_execute]
    else:
        check_config_file(plotting_server_address) # Make sure all things are set
        check_ssh_connection(plotting_server_address) # Make sure the SSH-connection works
        command =["export DISPLAY=:0.0;", "python","-u", file_to_execute]
        # TODO: Setting DISPLAY to :0.0 is a heuristic at the moment. I don't understand yet how these DISPLAY variables are set.

    # With the command set up, we can instantiate a child process and start it. Also we want to forward stdout and stderr from the remote process asynchronously.
    global _nanny
    _nanny = Nanny()
    cp = PythonChildProcess(ip_address=plotting_server_address, command=command, name="Plotting_Server",set_up_port_for_structured_back_communication=True)
    _nanny.register_child_process(cp,monitor_for_termination=False,monitor_if_stuck_timeout=None,)
    _nanny.execute_all_child_processes(blocking=False)
    back_comm_queue = cp.get_queue_from_cp()
    try:
        gettrace = getattr(sys, 'gettrace', None)
        timeout = None if (gettrace is not None and gettrace()) else 10  # wait indefinitely when running under a debugger
        server_message = back_comm_queue.get(block=True,timeout=timeout)
    except Queue.Empty:
        print("The Plotting Server did not respond for 10 seconds. It probably crashed")
        sys.exit(1)

    try:
        port = int(server_message.dbplot_message)
    except ValueError:
        print("There was an incorrect string on the remote server's stdout. Make sure the server first communicates a port number. Received:\n {}".format(server_message.dbplot_message))
        sys.exit(0)

    # In the remote setting we don't want to rely on the user correctly specifying their firewalls. Therefore we need to set up port forwarding through ssh:
    # Also, we have the ssh session open already, so why not reuse it.
    if plotting_server_address not in get_local_ips():
        ssh_conn = cp.get_ssh_connection()
        # Needs to be in a thread since the call blocks forever.
        # Todo: this ssh tunnel is opened system-wide. That means that any subsequent attempts to open the ssh-tunnel (by another dbplot-using process, for example)
        # will behave weirdly. As far as I have tested, nothing happens and the port forwarding works just fine in the second process. However, when one of the two
        # processes terminates, the ssh-tunnel is closed for the other process as well.
        t3 = threading.Thread(target = forward_tunnel, kwargs={"local_port":port, "remote_host":plotting_server_address, "remote_port":port,"ssh_conn":ssh_conn})
        t3.setDaemon(True)
        t3.start()

    # Now attempt to connect to the plotting server
    server_address = ("localhost", port)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.connect(tuple(server_address))
    except:
        raise

    # Once connected, set up the asynchronous threads that forward every dbplot call to the server. We do this asynchronously for two reasons:
    # 1.) The queues involved can be shared between different threads (not processes), so concurrent dbplot calls are all forwarded correctly.
    # 2.) Sending a plot to the server returns immediately, independent of any socket-related communication delays that might exist.
    # (There shouldn't be any, but, you know, principle.)
    global _to_subprocess_queue
    global _id_queue
    _to_subprocess_queue = Queue.Queue()
    _id_queue = Queue.Queue()
    t1 = threading.Thread(target=push_to_server, args=(_to_subprocess_queue, sock))
    t1.setDaemon(True)
    t1.start()
    # if blocking:
    t2 = threading.Thread(target=collect_from_server, args=(_id_queue, sock))
    t2.setDaemon(True)
    t2.start()