def status(simfolder):
    """
    Report the current state of a simulation.

    When the configured scheduler is 'pbs', first print counts labelled
    Waiting / Queued / Running, each obtained by piping ``qselect`` (for
    the current $USER) into ``wc -l``. In every case, ``fjd-dispatcher`` is
    then run in status-only mode for this simulation's project.

    :param string simfolder: relative path to simfolder
    :returns: True (no failure path is implemented here)
    """
    scheduler = utils.get_scheduler(simfolder)
    conf_path = "{}/stosim.conf".format(simfolder)
    sim_name = utils.get_simulation_name(simfolder, conf_path)

    if scheduler == 'pbs':
        # every *.pbs file in the jobs directory counts as one computing node
        jobs_dir = '{}/jobs'.format(simfolder)
        num_nodes = sum(1 for entry in os.listdir(jobs_dir)
                        if entry.endswith('.pbs'))
        if num_nodes == 0:
            print("[StoSim] No PBS computing nodes seem to be configured...")
        else:
            print("[StoSim] State of our {} PBS computing nodes:".format(num_nodes))
            # the shell is needed for the $(...) substitution and the pipe
            pbs_queries = (
                'echo "Waiting: $(qselect -u $USER -s W | wc -l)"',
                'echo "Queued: $(qselect -u $USER -s Q | wc -l)"',
                'echo "Running: $(qselect -u $USER -s R | wc -l)"',
            )
            for query in pbs_queries:
                subprocess.call(query, shell=True)

    print("[StoSim] State of workers and jobs:")
    interval = utils.get_interval(simfolder)
    status_cmd = 'fjd-dispatcher --project {} --status_only --interval {}'\
                 .format(sim_name, interval)
    subprocess.call(status_cmd, shell=True)
    return True
def resume(simfolder):
    """
    (Re)start dispatching jobs by running ``fjd-dispatcher`` for this
    simulation's project, using the interval configured for the simfolder.

    :param string simfolder: relative path to simfolder
    :returns: True (no failure path is implemented here)
    """
    conf_path = "{}/stosim.conf".format(simfolder)
    sim_name = utils.get_simulation_name(simfolder, conf_path)
    interval = utils.get_interval(simfolder)
    dispatch_cmd = 'fjd-dispatcher --project {} --interval {}'\
                   .format(sim_name, interval)
    subprocess.call(dispatch_cmd, shell=True)
    return True
def run(simfolder):
    '''
    The main function to start running simulations.

    Fills the FJD job queue with the ``*.conf`` job files found in
    ``<simfolder>/jobs``, recruits workers according to the configured
    scheduler ('fjd' or 'pbs') and then starts the fjd-dispatcher.

    :param string simfolder: relative path to simfolder
    :returns: False if ``<simfolder>/stosim.conf`` does not exist,
              True otherwise
    '''
    conf_file = "{}/stosim.conf".format(simfolder)

    # Bail out before anything tries to read the configuration: the
    # simulation name below is looked up from this very file.
    if not os.path.exists(conf_file):
        print("[StoSim] %s/stosim.conf does not exist!" % simfolder)
        utils.usage()
        return False

    print('*' * 80)
    sim_name = utils.get_simulation_name(simfolder, conf_file)
    print("Running simulation {}".format(sim_name))
    print('*' * 80)
    print('')

    # prepare all jobs to be run by FJD
    fjd_dir = fjd.utils.ensure_wdir(sim_name)
    fjd.utils.empty_queues(sim_name)
    jobs_dir = "{}/jobs".format(simfolder)
    job_files = os.listdir(jobs_dir)
    for job in [j for j in job_files if j.endswith('.conf')]:
        copy("{}/jobs/{}".format(simfolder, job), "{}/jobqueue".format(fjd_dir))
    dispatch_cmd = 'fjd-dispatcher --project {} --end_when_jobs_are_done '\
                   ' --callback "stosim --kill" --interval {}'\
                   .format(sim_name, utils.get_interval(simfolder))

    # now decide if recruiting is done in a local network or on a PBS cluster
    scheduler = utils.get_scheduler(simfolder)
    if scheduler == 'fjd':
        # let FJD handle it in local network (default: only local PC)
        remote_conf = '{}/remote.conf'.format(simfolder)
        # remember whether we handed a remote.conf to the recruiter, so the
        # cleanup below removes exactly what was copied here
        has_remote_conf = os.path.exists(remote_conf)
        if has_remote_conf:
            copy(remote_conf, fjd_dir)
        subprocess.call('fjd-recruiter --project {} hire'.format(sim_name),
                        shell=True)
        # we_exited is module-level state; skip dispatching once it is set
        if not we_exited[0]:
            subprocess.call(dispatch_cmd, shell=True)
        # when recruiter got remote.conf, clean up in fjd dir
        if has_remote_conf:
            os.remove('{}/remote.conf'.format(fjd_dir))
    elif scheduler == 'pbs':
        # queue the PBS jobs we created on a PBS job scheduler (e.g. clusters
        # running Torque or PBS Pro). These simply start FJD workers.
        for job in [j for j in job_files if j.endswith('.pbs')]:
            subprocess.call('qsub {}'.format('{}/jobs/{}'.format(simfolder, job)),
                            shell=True)
        # Now we start dispatching
        subprocess.call(dispatch_cmd, shell=True)

    return True
def kill(simfolder):
    """
    Kill simulation.

    With the 'fjd' scheduler, the fjd-recruiter is told to fire the workers
    of this simulation's project. With the 'pbs' scheduler, every job that
    ``qselect`` lists for $USER is passed to ``qdel``.

    Warning: On pbs, kills all your jobs (ignores --project)!

    :param string simfolder: relative path to simfolder
    :returns: True (no failure path is implemented here)
    """
    scheduler = utils.get_scheduler(simfolder)
    conf_path = "{}/stosim.conf".format(simfolder)
    sim_name = utils.get_simulation_name(simfolder, conf_path)

    # pick the shell command for the scheduler; unknown schedulers are a no-op
    if scheduler == 'pbs':
        kill_cmd = 'qselect -u $USER | xargs qdel'
    elif scheduler == 'fjd':
        kill_cmd = 'fjd-recruiter --project {} fire'.format(sim_name)
    else:
        kill_cmd = None

    if kill_cmd is not None:
        subprocess.call(kill_cmd, shell=True)
    return True