def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
    """
    # Split the available nodes among the requested number of sub jobs.
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # Shared data server every sub job talks to; its port is how they find it.
    data_server = DataServer.setup(launchpad)
    ds_port = data_server.address[1]

    # Spawn one rapidfire process per sub job.
    rocket_procs = start_rockets(fworker, nlaunches, sleep_time, loglvl, ds_port,
                                 node_lists, sub_nproc_list)

    # Background thread that keeps the launches alive with periodic pings.
    stop_event = threading.Event()
    pinger = threading.Thread(target=ping_multilaunch, args=(ds_port, stop_event))
    pinger.start()

    # Block until every sub job has finished, then tear everything down.
    for proc in rocket_procs:
        proc.join()
    stop_event.set()
    pinger.join()
    data_server.shutdown()
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None,
                        exclude_current_node=False, local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # Fix: guard against total_node_list being None (its default) --
        # `host in None` would raise TypeError before the else-branch could log.
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    # manager-backed dicts shared with the sub-processes so they can report
    # which FW ids are running and their firing state back to this process
    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect,
                              firing_state_dict=firing_state_dict)
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None,
                        exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs (int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution)
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # Fix: guard against total_node_list being None (its default) --
        # `host in None` would raise TypeError before the else-branch could log.
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    # manager-backed dict shared with the sub-processes so they can report
    # which FW ids are currently running back to this process
    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict)
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()