def ping_multilaunch(port, stop_event):
    """
    Ping the launches of all sub jobs from a single place during multiprocess launches.

    Connects to the DataServer on localhost and, until ``stop_event`` is set,
    walks the ``Running_IDs`` mapping held by ``FWData``, then waits up to
    ``PING_TIME_SECS`` on the event before the next pass. Entries with a launch
    id get their process probed and their launch pinged; when that raises
    ``OSError`` the entry is cleared and the process is flagged as not firing.

    Args:
        port (int): Listening port number of the DataServer
        stop_event (Thread.Event): stop event
    """
    server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    server.connect()
    shared = FWData()
    launchpad = server.LaunchPad()

    while not stop_event.is_set():
        for pid, launch_id in shared.Running_IDs.items():
            if not launch_id:
                # nothing registered for this process at the moment
                continue
            try:
                os.kill(pid, 0)  # signal 0 only probes; throws OSError if the process is dead
                launchpad.ping_launch(launch_id)
            except OSError:
                # treated as "this process is dead": drop its launch id and mark it idle
                shared.Running_IDs[pid] = None
                shared.FiringState[pid] = False
        # doubles as the sleep between passes and wakes early once the event is set
        stop_event.wait(PING_TIME_SECS)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the rapidfire is started again for as long as the
    shared ``Running_IDs`` mapping still holds a launch id that is not None,
    sleeping between attempts.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations; also used as the
            wait before a resubmit, falling back to RAPIDFIRE_SLEEP_SECS when falsy
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the DataServer on 127.0.0.1
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict: shared mapping stored as ``FWData().Running_IDs``; only its
            values are read here (presumably pid -> launch id, verify against caller)
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # One rapidfire run; note the raw ``sleep`` (not the fallback) is forwarded.
        rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                  max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(lid is not None for lid in FWData().Running_IDs.values()):
            # no sub job holds a launch any more: nothing further can show up
            break
        # Some other sub jobs are still running
        log_multi(l_logger, 'Sleeping for {} secs before resubmit sub job'.format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout):
    """
    Sub job entry point: publish the multiprocessing settings on FWData and run a rapidfire.

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the DataServer on 127.0.0.1
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    :param timeout: (int) # of seconds after which to stop the rapidfire process
    """
    server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    server.connect()
    shared_launchpad = server.LaunchPad()

    # same four attributes, same order, one FWData() lookup per assignment
    shared_settings = (
        ('DATASERVER', server),
        ('MULTIPROCESSING', True),
        ('NODE_LIST', node_list),
        ('SUB_NPROCS', sub_nproc),
    )
    for attr_name, attr_value in shared_settings:
        setattr(FWData(), attr_name, attr_value)

    fire_options = dict(fworker=fworker, m_dir=None, nlaunches=nlaunches, max_loops=-1,
                        sleep_time=sleep, strm_lvl=loglvl, timeout=timeout)
    rapidfire(shared_launchpad, **fire_options)
def rapidfire_process(
    fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout, running_ids_dict
):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    When ``nlaunches`` is 0 the rapidfire is started again for as long as the
    shared ``Running_IDs`` mapping still holds a launch id that is not None,
    sleeping between attempts.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations; also used as the
            wait before a resubmit, falling back to RAPIDFIRE_SLEEP_SECS when falsy
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the DataServer on 127.0.0.1
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict: shared mapping stored as ``FWData().Running_IDs``; only its
            values are read here (presumably pid -> launch id, verify against caller)
    """
    ds = DataServer(address=("127.0.0.1", port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    FWData().DATASERVER = ds
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc
    FWData().Running_IDs = running_ids_dict
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger("rocket.launcher", l_dir=l_dir, stream_level=loglvl)

    def _fire():
        # One rapidfire run; note the raw ``sleep`` (not the fallback) is forwarded.
        rapidfire(
            launchpad,
            fworker=fworker,
            m_dir=None,
            nlaunches=nlaunches,
            max_loops=-1,
            sleep_time=sleep,
            strm_lvl=loglvl,
            timeout=timeout,
        )

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(lid is not None for lid in FWData().Running_IDs.values()):
            # no sub job holds a launch any more: nothing further can show up
            break
        # Some other sub jobs are still running
        log_multi(l_logger, "Sleeping for {} secs before resubmit sub job".format(sleep_time))
        time.sleep(sleep_time)
        log_multi(l_logger, "Resubmit sub job")
        _fire()
    log_multi(l_logger, "Sub job finished")
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    The process marks itself as firing in the shared ``FiringState`` mapping for
    the duration of each rapidfire run. When ``nlaunches`` is 0 it starts the
    rapidfire again for as long as any entry in ``FiringState`` is still truthy.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the DataServer on 127.0.0.1
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict: shared mapping stored as ``FWData().Running_IDs``
        local_redirect (bool): redirect standard input and output to local file
            (forwarded unchanged to ``rapidfire``)
        firing_state_dict: shared mapping stored as ``FWData().FiringState``; this
            process writes its own pid -> bool entry and reads everyone else's
        macro_sleep_time (int): secs to sleep between sub job resubmit; when falsy it
            is computed once as ``sleep_time * len(FiringState)`` and then reused
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    my_pid = os.getpid()

    def _fire():
        # One rapidfire run bracketed by the firing flag. The flag is cleared in
        # ``finally`` so that a failing run cannot leave this pid marked as firing,
        # which would keep the other sub jobs resubmitting forever.
        fw_data.FiringState[my_pid] = True
        try:
            rapidfire(launchpad, fworker=fworker, m_dir=None, nlaunches=nlaunches,
                      max_loops=-1, sleep_time=sleep, strm_lvl=loglvl, timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[my_pid] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            # nobody is firing any more, so nothing new can show up
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(l_logger,
                  'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1):
    """
    Run the sub jobs of the job packing mode and block until all of them are done.

    Splits the node list, sets up the shared DataServer, starts the rapidfire
    processes, keeps a pinging thread alive while they run and finally shuts
    the thread and the server down.

    :param launchpad: (LaunchPad) object
    :param fworker: (FWorker) object
    :param loglvl: (str) level at which to output logs
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param num_jobs: (int) number of sub jobs
    :param sleep_time: (int) secs to sleep between rapidfire loop iterations
    :param total_node_list: ([str]) contents of NODEFILE (doesn't affect execution)
    :param ppn: (int) processors per node (doesn't affect execution)
    """
    # one node list / processor count per sub job
    per_job_nodes, per_job_nprocs = split_node_lists(num_jobs, total_node_list, ppn)

    # shared dataserver; the sub jobs and the pinger reach it through its port
    server = DataServer.setup(launchpad)
    server_port = server.address[1]

    # rapidfire processes
    workers = start_rockets(fworker, nlaunches, sleep_time, loglvl, server_port,
                            per_job_nodes, per_job_nprocs)

    # pinging service
    halt_pinging = threading.Event()
    pinger = threading.Thread(target=ping_multilaunch, args=(server_port, halt_pinging))
    pinger.start()

    # block until every sub job has ended, then tear the helpers down
    for worker in workers:
        worker.join()
    halt_pinging.set()
    pinger.join()
    server.shutdown()
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False,
                        local_redirect=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution). When
            exclude_current_node removes the host, the list is modified in place.
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
        local_redirect (bool): redirect standard input and output to local file
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; a bare membership test on None raises
        # TypeError, so a missing list is handled like "host not in list".
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    # dicts shared between this process and the sub jobs
    manager = Manager()
    running_ids_dict = manager.dict()
    firing_state_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict,
                              local_redirect=local_redirect,
                              firing_state_dict=firing_state_dict)
    # expose the same dicts on this process' FWData
    FWData().Running_IDs = running_ids_dict
    FWData().FiringState = firing_state_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc):
    """
    Sub job entry point: publish the multiprocessing settings on FWData and run a rapidfire.

    :param fworker: (FWorker) object
    :param nlaunches: (int) 0 means 'until completion', -1 or "infinite" means to loop forever
    :param sleep: (int) secs to sleep between rapidfire loop iterations
    :param loglvl: (str) level at which to output logs to stdout
    :param port: (int) Listening port number of the DataServer on 127.0.0.1
    :param node_list: ([str]) computer node list
    :param sub_nproc: (int) number of processors of the sub job
    """
    server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    server.connect()
    shared_launchpad = server.LaunchPad()

    # settings the launched code reads back from FWData
    FWData().DATASERVER = server
    FWData().MULTIPROCESSING = True
    FWData().NODE_LIST = node_list
    FWData().SUB_NPROCS = sub_nproc

    # arguments are passed positionally; the third (None) and fifth (-1) slots are
    # presumably the launch directory and max loop count (confirm against rapidfire)
    fire_args = (shared_launchpad, fworker, None, nlaunches, -1, sleep, loglvl)
    rapidfire(*fire_args)
def launch_multiprocess(launchpad, fworker, loglvl, nlaunches, num_jobs, sleep_time,
                        total_node_list=None, ppn=1, timeout=None, exclude_current_node=False):
    """
    Launch the jobs in the job packing mode.

    Args:
        launchpad (LaunchPad)
        fworker (FWorker)
        loglvl (str): level at which to output logs
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        num_jobs(int): number of sub jobs
        sleep_time (int): secs to sleep between rapidfire loop iterations
        total_node_list ([str]): contents of NODEFILE (doesn't affect execution). When
            exclude_current_node removes the host, the list is modified in place.
        ppn (int): processors per node (doesn't affect execution)
        timeout (int): # of seconds after which to stop the rapidfire process
        exclude_current_node: Don't use the script launching node as a compute node
    """
    # parse node file contents
    if exclude_current_node:
        host = get_my_host()
        l_dir = launchpad.get_logdir() if launchpad else None
        l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
        # total_node_list defaults to None; a bare membership test on None raises
        # TypeError, so a missing list is handled like "host not in list".
        if total_node_list and host in total_node_list:
            log_multi(l_logger, "Remove the current node \"{}\" from compute node".format(host))
            total_node_list.remove(host)
        else:
            log_multi(l_logger, "The current node is not in the node list, keep the node list as is")
    node_lists, sub_nproc_list = split_node_lists(num_jobs, total_node_list, ppn)

    # create shared dataserver
    ds = DataServer.setup(launchpad)
    port = ds.address[1]

    # dict shared between this process and the sub jobs
    manager = Manager()
    running_ids_dict = manager.dict()

    # launch rapidfire processes
    processes = start_rockets(fworker, nlaunches, sleep_time, loglvl, port, node_lists,
                              sub_nproc_list, timeout=timeout,
                              running_ids_dict=running_ids_dict)
    # expose the same dict on this process' FWData
    FWData().Running_IDs = running_ids_dict

    # start pinging service
    ping_stop = threading.Event()
    ping_thread = threading.Thread(target=ping_multilaunch, args=(port, ping_stop))
    ping_thread.start()

    # wait for completion
    for p in processes:
        p.join()
    ping_stop.set()
    ping_thread.join()
    ds.shutdown()
def ping_multilaunch(port, stop_event):
    """
    Ping the launches of all sub jobs from a single place during multiprocess launches.

    Connects to the DataServer on localhost and, until ``stop_event`` is set,
    fetches the server's ``Running_IDs()`` mapping, pings the launch of every
    entry that has a launch id, then waits up to ``PING_TIME_SECS`` on the event.

    :param port: (int) Listening port number of the DataServer
    :param stop_event: (Thread.Event) stop event
    """
    server = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    server.connect()
    launchpad = server.LaunchPad()

    while not stop_event.is_set():
        for pid, launch_id in server.Running_IDs().items():
            if not launch_id:
                # nothing registered for this process at the moment
                continue
            try:
                os.kill(pid, 0)  # signal 0 only probes; throws OSError if the process is dead
                launchpad.ping_launch(launch_id)
            except OSError:
                # treated as "this process is dead": skip it silently
                continue
        # doubles as the sleep between passes and wakes early once the event is set
        stop_event.wait(PING_TIME_SECS)
def rapidfire_process(fworker, nlaunches, sleep, loglvl, port, node_list, sub_nproc, timeout,
                      running_ids_dict, local_redirect, firing_state_dict, macro_sleep_time=None):
    """
    Initializes shared data with multiprocessing parameters and starts a rapidfire.

    The process marks itself as firing in the shared ``FiringState`` mapping for
    the duration of each rapidfire run. When ``nlaunches`` is 0 it starts the
    rapidfire again for as long as any entry in ``FiringState`` is still truthy.

    Args:
        fworker (FWorker): object
        nlaunches (int): 0 means 'until completion', -1 or "infinite" means to loop forever
        sleep (int): secs to sleep between rapidfire loop iterations
        loglvl (str): level at which to output logs to stdout
        port (int): Listening port number of the DataServer on 127.0.0.1
        node_list ([str]): computer node list
        sub_nproc (int): number of processors of the sub job
        timeout (int): # of seconds after which to stop the rapidfire process
        running_ids_dict: shared mapping stored as ``FWData().Running_IDs``
        local_redirect (bool): redirect standard input and output to local file
            (forwarded unchanged to ``rapidfire``)
        firing_state_dict: shared mapping stored as ``FWData().FiringState``; this
            process writes its own pid -> bool entry and reads everyone else's
        macro_sleep_time (int): secs to sleep between sub job resubmit; when falsy it
            is computed once as ``sleep_time * len(FiringState)`` and then reused
    """
    ds = DataServer(address=('127.0.0.1', port), authkey=DS_PASSWORD)
    ds.connect()
    launchpad = ds.LaunchPad()
    fw_data = FWData()
    fw_data.DATASERVER = ds
    fw_data.MULTIPROCESSING = True
    fw_data.NODE_LIST = node_list
    fw_data.SUB_NPROCS = sub_nproc
    fw_data.Running_IDs = running_ids_dict
    fw_data.FiringState = firing_state_dict
    fw_data.lp = launchpad
    sleep_time = sleep if sleep else RAPIDFIRE_SLEEP_SECS
    l_dir = launchpad.get_logdir() if launchpad else None
    l_logger = get_fw_logger('rocket.launcher', l_dir=l_dir, stream_level=loglvl)
    my_pid = os.getpid()

    def _fire():
        # One rapidfire run bracketed by the firing flag. The flag is cleared in
        # ``finally`` so that a failing run cannot leave this pid marked as firing,
        # which would keep the other sub jobs resubmitting forever.
        fw_data.FiringState[my_pid] = True
        try:
            rapidfire(launchpad,
                      fworker=fworker,
                      m_dir=None,
                      nlaunches=nlaunches,
                      max_loops=-1,
                      sleep_time=sleep,
                      strm_lvl=loglvl,
                      timeout=timeout,
                      local_redirect=local_redirect)
        finally:
            fw_data.FiringState[my_pid] = False

    _fire()
    while nlaunches == 0:
        time.sleep(1.5)  # wait for LaunchPad to be initialized
        if not any(fw_data.FiringState.values()):
            # nobody is firing any more, so nothing new can show up
            break
        # Some other sub jobs are still running
        if not macro_sleep_time:
            macro_sleep_time = sleep_time * len(fw_data.FiringState)
        log_multi(
            l_logger,
            'Sleeping for {} secs before resubmit sub job'.format(macro_sleep_time))
        time.sleep(macro_sleep_time)
        log_multi(l_logger, 'Resubmit sub job')
        _fire()
    log_multi(l_logger, 'Sub job finished')