def run_multiple_experiments_with_slurm(experiments, n_parallel=None, max_processes_per_node=None, raise_exceptions=True, run_args=None, slurm_kwargs=None):
    """
    Run multiple experiments using SLURM, optionally in parallel.

    :param experiments: Iterable of experiments to run.
    :param n_parallel: If > 1, the experiments are divided into subsets of this size and each
        subset is submitted as one SLURM job running that many processes in parallel.
    :param max_processes_per_node: If given, overrides n_parallel as the per-node process count.
    :param raise_exceptions: Whether a failing experiment should raise (sequential mode only).
    :param run_args: Dict of extra keyword arguments forwarded to the experiment runner.
    :param slurm_kwargs: Dict of extra keyword arguments forwarded to SlurmPythonProcess.
    """
    # Fix: the defaults were mutable dicts ({}), which are shared across calls and can
    # silently accumulate state. Use None sentinels instead.
    run_args = {} if run_args is None else run_args
    slurm_kwargs = {} if slurm_kwargs is None else slurm_kwargs
    if n_parallel and n_parallel > 1:
        print('Warning... parallel-slurm integration is very beta. Use with caution')
        experiment_subsets = divide_into_subsets(experiments, subset_size=n_parallel)
        for i, exp_subset in enumerate(experiment_subsets):
            nanny = Nanny()
            function_call = partial(
                run_multiple_experiments,
                experiments=exp_subset,
                parallel=n_parallel if max_processes_per_node is None else max_processes_per_node,
                display_results=False,
                run_args=run_args)
            spp = SlurmPythonProcess(name="Group %i" % i, function=function_call, ip_address="127.0.0.1", slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
    else:
        for i, exp in enumerate(experiments):
            nanny = Nanny()
            function_call = partial(
                run_experiment, experiment=exp, slurm_job=True, experiment_path=get_experiment_dir(),
                raise_exceptions=raise_exceptions, display_results=False, **run_args)
            spp = SlurmPythonProcess(name="Exp %i" % i, function=function_call, ip_address="127.0.0.1", slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
def test_simple_process():
    """A single success_function child on each configured host should print '<name>: Success'."""
    for ip_address in ip_addresses:
        cmd = "python %s --callback=%s" % (get_test_functions_path(ip_address), "success_function")
        child = PythonChildProcess(name="Process", ip_address=ip_address, command=cmd)
        caretaker = Nanny()
        caretaker.register_child_process(child)
        with captured_output() as (out, err):
            caretaker.execute_all_child_processes()
        expected = "%s: Success" % (child.get_name())
        assert expected == out.getvalue().strip()
def test_several_simple_processes(N):
    """N success_function children per host should each report success on the captured stdout."""
    for ip_address in ip_addresses:
        supervisor = Nanny()
        cmd = "python %s --callback=%s" % (get_test_functions_path(ip_address), "success_function")
        for idx in range(N):
            child = PythonChildProcess(name="Process%i" % idx, ip_address=ip_address, command=cmd)
            supervisor.register_child_process(child)
        with captured_output() as (out, err):
            supervisor.execute_all_child_processes()
        captured = out.getvalue().strip()
        for child in supervisor.managed_child_processes.values():
            assert "%s: Success" % (child.get_name()) in captured
def test_iter_print():
    """Locally, iter_print's even lines land on stdout and odd lines on stderr; remotely everything arrives on stdout."""
    for ip_address in ip_addresses:
        watcher = Nanny()
        args = ["python", "-u", get_test_functions_path(ip_address), "--callback=iter_print"]
        proc = PythonChildProcess(name="P1", ip_address=ip_address, command=args)
        watcher.register_child_process(proc)
        with captured_output() as (out, err):
            watcher.execute_all_child_processes(time_out=1)
        if proc.is_local():
            evens = "\n".join(["P1: %i" % i for i in [0, 2, 4, 6, 8]])
            odds = "\n".join(["P1: %i" % i for i in [1, 3, 5, 7, 9]])
            assert str(out.getvalue().strip()) == evens
            assert str(err.getvalue().strip()) == odds
        else:
            full = "\r\n".join(["P1: %i" % i for i in range(10)])
            assert full == str(out.getvalue().strip())
def test_process_termination():
    """When the short-lived child exits, the long-lived one must be killed after the 1s grace period."""
    for ip_address in ip_addresses:
        overseer = Nanny()
        quick = PythonChildProcess(
            name="Process1", ip_address=ip_address,
            command="python %s --callback=%s" % (get_test_functions_path(ip_address), "count_low"))
        overseer.register_child_process(quick)
        slow = PythonChildProcess(
            name="Process2", ip_address=ip_address,
            command="python %s --callback=%s" % (get_test_functions_path(ip_address), "count_high"))
        overseer.register_child_process(slow)
        with captured_output() as (out, err):
            overseer.execute_all_child_processes(time_out=1)
        notice = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now." % ip_address
        assert notice in out.getvalue() or notice in err.getvalue()
def test_output_monitor():
    """A stuck child should trigger the stuck-timeout message, and the survivor the termination message."""
    for ip_address in ip_addresses:
        minder = Nanny()
        sleeper = PythonChildProcess(
            name="Process1", ip_address=ip_address,
            command="python %s --callback=%s" % (get_test_functions_path(ip_address), "short_sleep"))
        minder.register_child_process(sleeper, monitor_if_stuck_timeout=5)
        counter = PythonChildProcess(
            name="Process2", ip_address=ip_address,
            command="python %s --callback=%s" % (get_test_functions_path(ip_address), "count_high"))
        minder.register_child_process(counter, monitor_if_stuck_timeout=3)
        with captured_output() as (out, err):
            minder.execute_all_child_processes(time_out=1)
        stuck_msg = "Timeout occurred after 0.1 min, process Process1 stuck"
        term_msg = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now." % ip_address
        assert term_msg in out.getvalue() or term_msg in err.getvalue()
        assert stuck_msg in out.getvalue() or stuck_msg in err.getvalue()
def test_process_termination():
    """The nanny must terminate the still-running child 1 second after its sibling finishes."""
    for host in ip_addresses:
        nanny = Nanny()
        for proc_name, callback in (("Process1", "count_low"), ("Process2", "count_high")):
            command = "python %s --callback=%s" % (get_test_functions_path(host), callback)
            nanny.register_child_process(PythonChildProcess(name=proc_name, ip_address=host, command=command))
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        check_text = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now." % host
        assert check_text in out.getvalue() or check_text in err.getvalue()
def test_simple_process():
    """Run the success_function callback once per host and check the forwarded stdout."""
    for host in ip_addresses:
        supervisor = Nanny()
        script = get_test_functions_path(host)
        proc = PythonChildProcess(
            name="Process", ip_address=host,
            command="python %s --callback=%s" % (script, "success_function"))
        supervisor.register_child_process(proc)
        with captured_output() as (out, err):
            supervisor.execute_all_child_processes()
        assert out.getvalue().strip() == "%s: Success" % (proc.get_name())
def test_output_monitor():
    """Expect both a stuck-process warning for Process1 and a forced-termination notice for Process2."""
    for host in ip_addresses:
        nanny = Nanny()
        for proc_name, callback, stuck_timeout in (("Process1", "short_sleep", 5), ("Process2", "count_high", 3)):
            command = "python %s --callback=%s" % (get_test_functions_path(host), callback)
            child = PythonChildProcess(name=proc_name, ip_address=host, command=command)
            nanny.register_child_process(child, monitor_if_stuck_timeout=stuck_timeout)
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        check_text1 = "Timeout occurred after 0.1 min, process Process1 stuck"
        check_text = "Child Process Process2 at %s did not terminate 1 seconds after the first process in cluster terminated. Terminating now." % host
        assert check_text in out.getvalue() or check_text in err.getvalue()
        assert check_text1 in out.getvalue() or check_text1 in err.getvalue()
def test_several_simple_processes(N):
    """Every one of the N registered children should report success in the combined stdout."""
    for host in ip_addresses:
        nanny = Nanny()
        command = "python %s --callback=%s" % (get_test_functions_path(host), "success_function")
        for n in range(N):
            nanny.register_child_process(
                PythonChildProcess(name="Process%i" % n, ip_address=host, command=command))
        with captured_output() as (out, err):
            nanny.execute_all_child_processes()
        text = out.getvalue().strip()
        for proc in nanny.managed_child_processes.values():
            assert "%s: Success" % (proc.get_name()) in text
def run_multiple_experiments_with_slurm(experiments, n_parallel=None, raise_exceptions=True, run_args=None, slurm_kwargs=None):
    """
    Run multiple experiments using SLURM, one job per experiment.

    :param experiments: Iterable of experiments to run sequentially (one SLURM job each).
    :param n_parallel: Parallel execution is not implemented; any value > 1 raises.
    :param raise_exceptions: Whether a failing experiment should raise.
    :param run_args: Dict of extra keyword arguments forwarded to run_experiment.
    :param slurm_kwargs: Dict of extra keyword arguments forwarded to SlurmPythonProcess.
    :raises NotImplementedError: If n_parallel > 1.
    """
    # Fix: the defaults were mutable dicts ({}), which are shared across calls and can
    # silently accumulate state. Use None sentinels instead.
    run_args = {} if run_args is None else run_args
    slurm_kwargs = {} if slurm_kwargs is None else slurm_kwargs
    if n_parallel and n_parallel > 1:
        raise NotImplementedError("No parallel Slurm execution at the moment. Implement it!")
    else:
        for i, exp in enumerate(experiments):
            nanny = Nanny()
            func = run_experiment
            experiment_path = get_experiment_dir()
            function_call = partial(
                func, experiment=exp, slurm_job=True, experiment_path=experiment_path,
                raise_exceptions=raise_exceptions, display_results=False, **run_args)
            spp = SlurmPythonProcess(name="Exp %i" % i, function=function_call, ip_address="127.0.0.1", slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
def test_iter_print():
    """Check stdout/stderr splitting of iter_print locally, and merged CRLF stdout remotely."""
    for host in ip_addresses:
        nanny = Nanny()
        pyc = PythonChildProcess(
            name="P1", ip_address=host,
            command=["python", "-u", get_test_functions_path(host), "--callback=iter_print"])
        nanny.register_child_process(pyc)
        with captured_output() as (out, err):
            nanny.execute_all_child_processes(time_out=1)
        if pyc.is_local():
            assert str(out.getvalue().strip()) == "\n".join(["P1: %i" % i for i in range(0, 10, 2)])
            assert str(err.getvalue().strip()) == "\n".join(["P1: %i" % i for i in range(1, 10, 2)])
        else:
            assert str(out.getvalue().strip()) == "\r\n".join(["P1: %i" % i for i in range(10)])
def run_multiple_experiments_with_slurm(experiments, n_parallel=None, max_processes_per_node=None, raise_exceptions=True, run_args=None, slurm_kwargs=None):
    """
    Run multiple experiments using SLURM, optionally in parallel.

    :param experiments: Iterable of experiments to run.
    :param n_parallel: If > 1, the experiments are divided into subsets of this size and each
        subset is submitted as one SLURM job running that many processes in parallel.
    :param max_processes_per_node: If given, overrides n_parallel as the per-node process count.
    :param raise_exceptions: Whether a failing experiment should raise (sequential mode only).
    :param run_args: Dict of extra keyword arguments forwarded to the experiment runner.
    :param slurm_kwargs: Dict of extra keyword arguments forwarded to SlurmPythonProcess.
    """
    # Fix: the defaults were mutable dicts ({}), which are shared across calls and can
    # silently accumulate state. Use None sentinels instead.
    run_args = {} if run_args is None else run_args
    slurm_kwargs = {} if slurm_kwargs is None else slurm_kwargs
    if n_parallel and n_parallel > 1:
        print('Warning... parallel-slurm integration is very beta. Use with caution')
        experiment_subsets = divide_into_subsets(experiments, subset_size=n_parallel)
        for i, exp_subset in enumerate(experiment_subsets):
            nanny = Nanny()
            function_call = partial(
                run_multiple_experiments,
                experiments=exp_subset,
                parallel=n_parallel if max_processes_per_node is None else max_processes_per_node,
                display_results=False,
                run_args=run_args)
            spp = SlurmPythonProcess(name="Group %i" % i,
                                     function=function_call,
                                     ip_address="127.0.0.1",
                                     slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
    else:
        for i, exp in enumerate(experiments):
            nanny = Nanny()
            function_call = partial(run_experiment,
                                    experiment=exp,
                                    slurm_job=True,
                                    experiment_path=get_experiment_dir(),
                                    raise_exceptions=raise_exceptions,
                                    display_results=False,
                                    **run_args)
            spp = SlurmPythonProcess(name="Exp %i" % i,
                                     function=function_call,
                                     ip_address="127.0.0.1",
                                     slurm_kwargs=slurm_kwargs)
            # Using Nanny only for convenient stdout & stderr forwarding.
            nanny.register_child_process(spp, monitor_for_termination=False)
            nanny.execute_all_child_processes(time_out=2)
def set_up_plotting_server():
    """
    Sets up the plotting server.

    Launches plotting_server.py (locally, or on the configured remote machine over SSH),
    waits for the server to report the port it listens on, establishes an SSH tunnel when
    the server is remote, connects a socket to it, and starts the background threads that
    forward dbplot traffic to and from the server.
    """
    print("Setting up Plotting Server")

    # First we generate the system call that starts the server
    # TODO: This assumes the same installation path relative to the home-dir on the local machine as on the remote machine
    file_to_execute = os.path.join(os.path.dirname(__file__), 'plotting_server.py')
    file_to_execute = file_to_execute.replace(os.path.expanduser("~"), "~", 1)
    plotting_server_address = get_plotting_server_address()
    if plotting_server_address == "":
        plotting_server_address = "127.0.0.1"
    if plotting_server_address in get_local_ips():
        command = ["python", "-u", file_to_execute]
    else:
        check_config_file(plotting_server_address)  # Make sure all things are set
        check_ssh_connection(plotting_server_address)  # Make sure the SSH-connection works
        command = ["export DISPLAY=:0.0;", "python", "-u", file_to_execute]
        # TODO: Setting DISPLAY to :0.0 is a heuristic at the moment. I don't understand yet how these DISPLAY variables are set.

    # With the command set up, we can instantiate a child process and start it.
    # Also we want to forward stdout and stderr from the remote process asynchronously.
    global _nanny
    _nanny = Nanny()
    cp = PythonChildProcess(ip_address=plotting_server_address, command=command, name="Plotting_Server", set_up_port_for_structured_back_communication=True)
    _nanny.register_child_process(cp, monitor_for_termination=False, monitor_if_stuck_timeout=None)
    _nanny.execute_all_child_processes(blocking=False)
    back_comm_queue = cp.get_queue_from_cp()
    try:
        # sys.gettrace() returns a trace function only when a debugger is active; in that
        # case wait indefinitely so a breakpoint doesn't trip the 10-second timeout.
        is_debug_mode = getattr(sys, 'gettrace', None)
        timeout = None if is_debug_mode() else 10
        server_message = back_comm_queue.get(block=True, timeout=timeout)
    except Queue.Empty:
        print("The Plotting Server did not respond for 10 seconds. It probably crashed")
        sys.exit(1)

    try:
        port = int(server_message.dbplot_message)
    except ValueError:
        # Fix: this handler previously referenced an undefined name `str_port`, so a bad
        # message raised NameError instead of printing the diagnostic. Report what we got.
        print("There was an incorrect string on the remote server's stdout. Make sure the server first communicates a port number. Received:\n {}".format(server_message.dbplot_message))
        sys.exit(0)

    # In the remote setting we don't want to rely on the user correctly specifying their firewalls.
    # Therefore we need to set up port forwarding through ssh.
    # Also, we have the ssh session open already, so why not reuse it.
    if plotting_server_address not in get_local_ips():
        ssh_conn = cp.get_ssh_connection()
        # Needs to be in a thread since the call blocks forever.
        # Todo: this ssh tunnel is opened system-wide. That means that any subsequent attempts to open
        # the ssh-tunnel (by another dbplot-using process, for example) will perform weirdly. As far as
        # I have tested, nothing happens and the port forwarding works just fine in the second process.
        # However when one of the two processes terminates, the ssh-tunnel is closed for the other process as well.
        t3 = threading.Thread(target=forward_tunnel, kwargs={"local_port": port, "remote_host": plotting_server_address, "remote_port": port, "ssh_conn": ssh_conn})
        t3.setDaemon(True)
        t3.start()

    # Now attempt to connect to the plotting server. A connection failure propagates,
    # exactly as the original `try: ... except: raise` did.
    server_address = ("localhost", port)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect(tuple(server_address))

    # Once connected, set up the asynchronous threads that forward every dbplot call to the server.
    # We make this asynchronous for two reasons:
    # 1.) the queues involved can be shared between different threads (not processes), therefore
    #     allowing for asynchronous dbplot calls that are both correctly forwarded.
    # 2.) sending a plot away to the server now immediately returns, independent of any
    #     socket-related communication delays that might exist.
    #     (There shouldn't be any, but, you know, principle)
    global _to_subprocess_queue
    global _id_queue
    _to_subprocess_queue = Queue.Queue()
    _id_queue = Queue.Queue()
    t1 = threading.Thread(target=push_to_server, args=(_to_subprocess_queue, sock))
    t1.setDaemon(True)
    t1.start()
    # if blocking:
    t2 = threading.Thread(target=collect_from_server, args=(_id_queue, sock))
    t2.setDaemon(True)
    t2.start()
def set_up_plotting_server():
    """
    Sets up the plotting server.

    Launches plotting_server.py (locally, or on the configured remote machine over SSH),
    waits for the server to report the port it listens on, establishes an SSH tunnel when
    the server is remote, connects a socket to it, and starts the background threads that
    forward dbplot traffic to and from the server.
    """
    print("Setting up Plotting Server")

    # First we generate the system call that starts the server
    # TODO: This assumes the same installation path relative to the home-dir on the local machine as on the remote machine
    file_to_execute = os.path.join(os.path.dirname(__file__), 'plotting_server.py')
    file_to_execute = file_to_execute.replace(os.path.expanduser("~"), "~", 1)
    plotting_server_address = get_plotting_server_address()
    if plotting_server_address == "":
        plotting_server_address = "127.0.0.1"
    if plotting_server_address in get_local_ips():
        command = ["python", "-u", file_to_execute]
    else:
        check_config_file(plotting_server_address)  # Make sure all things are set
        check_ssh_connection(plotting_server_address)  # Make sure the SSH-connection works
        command = ["export DISPLAY=:0.0;", "python", "-u", file_to_execute]
        # TODO: Setting DISPLAY to :0.0 is a heuristic at the moment. I don't understand yet how these DISPLAY variables are set.

    # With the command set up, we can instantiate a child process and start it.
    # Also we want to forward stdout and stderr from the remote process asynchronously.
    global _nanny
    _nanny = Nanny()
    cp = PythonChildProcess(ip_address=plotting_server_address, command=command, name="Plotting_Server", set_up_port_for_structured_back_communication=True)
    _nanny.register_child_process(cp, monitor_for_termination=False, monitor_if_stuck_timeout=None)
    _nanny.execute_all_child_processes(blocking=False)
    back_comm_queue = cp.get_queue_from_cp()
    try:
        # sys.gettrace() returns a trace function only when a debugger is active; in that
        # case wait indefinitely so a breakpoint doesn't trip the 10-second timeout.
        is_debug_mode = getattr(sys, 'gettrace', None)
        timeout = None if is_debug_mode() else 10
        server_message = back_comm_queue.get(block=True, timeout=timeout)
    except Queue.Empty:
        print("The Plotting Server did not respond for 10 seconds. It probably crashed")
        sys.exit(1)

    try:
        port = int(server_message.dbplot_message)
    except ValueError:
        # Fix: the message literal was broken across physical lines (a raw newline inside
        # a plain string is a syntax error); rejoined into a single valid literal.
        print("There was an incorrect string on the remote server's stdout. Make sure the server first communicates a port number. Received:\n {}".format(server_message.dbplot_message))
        sys.exit(0)

    # In the remote setting we don't want to rely on the user correctly specifying their firewalls.
    # Therefore we need to set up port forwarding through ssh.
    # Also, we have the ssh session open already, so why not reuse it.
    if plotting_server_address not in get_local_ips():
        ssh_conn = cp.get_ssh_connection()
        # Needs to be in a thread since the call blocks forever.
        # Todo: this ssh tunnel is opened system-wide. That means that any subsequent attempts to open
        # the ssh-tunnel (by another dbplot-using process, for example) will perform weirdly. As far as
        # I have tested, nothing happens and the port forwarding works just fine in the second process.
        # However when one of the two processes terminates, the ssh-tunnel is closed for the other process as well.
        t3 = threading.Thread(target=forward_tunnel, kwargs={"local_port": port, "remote_host": plotting_server_address, "remote_port": port, "ssh_conn": ssh_conn})
        t3.setDaemon(True)
        t3.start()

    # Now attempt to connect to the plotting server. A connection failure propagates,
    # exactly as the original `try: ... except: raise` did.
    server_address = ("localhost", port)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.connect(tuple(server_address))

    # Once connected, set up the asynchronous threads that forward every dbplot call to the server.
    # We make this asynchronous for two reasons:
    # 1.) the queues involved can be shared between different threads (not processes), therefore
    #     allowing for asynchronous dbplot calls that are both correctly forwarded.
    # 2.) sending a plot away to the server now immediately returns, independent of any
    #     socket-related communication delays that might exist.
    #     (There shouldn't be any, but, you know, principle)
    global _to_subprocess_queue
    global _id_queue
    _to_subprocess_queue = Queue.Queue()
    _id_queue = Queue.Queue()
    t1 = threading.Thread(target=push_to_server, args=(_to_subprocess_queue, sock))
    t1.setDaemon(True)
    t1.start()
    # if blocking:
    t2 = threading.Thread(target=collect_from_server, args=(_id_queue, sock))
    t2.setDaemon(True)
    t2.start()