def check_job_name_queued_or_running(job_name):
    with TemporaryDirectory() as temp_folder:
        user = getuser()

        # Launch the job
        command = submit(job_command="sleep 600", job_name=job_name,
                         time="700", memory=500, log_directory=temp_folder)
        job_id = _check_job_id(command)

        # Assert that the job has been launched
        try:
            running_jobs = queued_or_running_jobs(user=user)
            assert_in(job_name, running_jobs)
        finally:
            # Make sure to clean up even if there is a failure
            backend = _get_backend('auto')
            if backend == "slurm":
                subprocess.call(["scancel", job_id])
            elif backend == "sge":
                subprocess.call(["qdel", job_id])
            else:
                raise NotImplementedError("backend not implemented")
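A minimal sketch of how this helper might be driven, assuming a nose-style test generator and that these example job names are acceptable to the scheduler:

def test_queued_or_running_jobs():
    # Each yielded tuple runs the helper once with a different job name.
    for job_name in ("test-job", "test_job_2"):
        yield check_job_name_queued_or_running, job_name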
def make_scripts(self, cmds, job_names=None):
    sys.stdout.write("Writing Scripts...\n")
    scheduled_jobs = set(queued_or_running_jobs(user=self.user))
    scripts = []
    if job_names is None:
        job_names = [None] * len(cmds)
    for index, cmd in enumerate(cmds):
        job_name = job_names[index] or self.__get_job_name(cmd)
        if job_name not in scheduled_jobs:
            script = submit(cmd, memory="%s" % self.memory,
                            job_name=job_name,
                            log_directory=self.slurm_logs,
                            backend='slurm', time=0,
                            shell_script="#!/usr/bin/env bash")
            script += " --partition=%s" % self.cluster_partition
            script += " --cpus-per-task=%s" % self.threads
            scripts.append(script)
            if self.verbose:
                sys.stdout.write(script + "\n")
    return scripts
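Hypothetical usage, assuming `launcher` is an instance of the surrounding class with `user`, `memory`, `slurm_logs`, `cluster_partition`, `threads`, and `verbose` set; each returned string is a complete submission command that can be handed to the shell:

import subprocess

scripts = launcher.make_scripts(["python train.py --fold 0",
                                 "python train.py --fold 1"])
for script in scripts:
    subprocess.call(script, shell=True)  # hand each script to sbatch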
def dispatch_to_slurm(commands):
    scripts = {}
    for job_name, command in commands.items():
        script = submit(command, job_name=job_name, time="0",
                        memory="{}G".format(maxmem), backend="slurm",
                        shell_script="#!/usr/bin/env bash")
        script += " --partition={}".format(partition)
        script += " --ntasks=1"
        script += " --cpus-per-task={}".format(maxcpu)
        script += " --mail-type=END,FAIL"
        script += " --mail-user={}".format(email)
        scripts[job_name] = script

    scheduled_jobs = set(queued_or_running_jobs())
    for job_name, script in scripts.items():
        if job_name not in scheduled_jobs:
            if verbose:
                print("{}".format(script), file=sys.stdout)
            if not dry_run:
                subprocess.call(script, shell=True)
        else:
            print("{} already running, skipping".format(job_name),
                  file=sys.stderr)
def dispatch(scripts):
    running = set(queued_or_running_jobs())
    for job_name, script in scripts.items():
        if job_name not in running and not output_exists(script):
            subprocess.call(script, shell=True)
        else:
            print("Job already underway or completed: {}".format(job_name),
                  file=sys.stderr)
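`output_exists` is not shown in this snippet; below is a hypothetical sketch under the assumption that each script names its result file in an `--output` flag. Both the flag name and the parsing are assumptions, not the project's actual helper:

import os
import re

def output_exists(script):
    # Treat the job as completed when the file named by --output exists.
    match = re.search(r"--output[= ](\S+)", script)
    return match is not None and os.path.exists(match.group(1))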
def test_queued_or_running_jobs_nobackend():
    """Test queued_or_running_jobs when no backend is available."""
    # Note that we can't use _get_backend since the user might
    # have set the CLUSTERLIB_BACKEND environment variable.
    if _which('qmod') is None and _which('scontrol') is None:
        # No backend available, thus no running jobs
        assert_equal(queued_or_running_jobs(), [])
    else:
        raise SkipTest("A backend is installed")
def main():
    num_args = len(sys.argv) - 1
    if num_args < 1:
        config_path = '../config.ini'
    elif num_args > 1:
        raise Exception('too many arguments: %d. %d expected'
                        % (num_args, 1))
    else:
        config_path = sys.argv[1]
    config_file = io.abspath2(config_path)

    np.random.seed(3463)

    config = io.load_config(config_file)
    model_list = io.get_model_list(config['input_path'], config['pkl_ext'])
    # In case we don't finish, at least cover a random subset
    np.random.shuffle(model_list)
    # model_list = model_list[:5]  # TODO remove, test only
    assert all(io.is_safe_name(ss) for ss in model_list)
    print('using models:')
    print(model_list)

    # Sort for reproducibility
    sampler_list = sorted(list(BUILD_STEP_PM) + list(BUILD_STEP_MC))
    print('using samplers:')
    print(sampler_list)

    # Run n_chains in the outer loop: if the process gets killed we have
    # fewer chains but an even distribution over models and samplers.
    scheduled_jobs = set(queued_or_running_jobs())
    for model_name in model_list:
        # Get the exact samples
        run_experiment(config, model_name, config['exact_name'])
        # Get the sampler samples
        for i in range(config['n_chains']):
            # TODO could put ADVI init here to keep it fixed across samplers
            for sampler in sampler_list:
                t = time()
                job_name = "slurm-%s-%s-%d" % (model_name, sampler, i)
                cmd_line_args = (config_file, model_name, sampler)
                if job_name in scheduled_jobs:
                    print('%s already in scheduled jobs, but running anyway'
                          % job_name)
                options = ("-c 1 --job-name=%s -t 45:00 --mem=32gb "
                           "--output %s.out" % (job_name, job_name))
                end = "slurm_job_main.sh %s %s %s" % cmd_line_args
                command = "sbatch %s %s" % (options, end)
                print('Executing:', command)
                os.system(command)
                print('wall time %fs' % (time() - t))
    print('done')
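Since this script already imports from clusterlib, the hand-assembled sbatch line in the inner loop could equally be built with the `submit` helper. A hedged sketch of that alternative; the memory-unit conversion (32gb to 32000 MB) and the manually appended flags are assumptions:

from clusterlib.scheduler import submit

# Inside the inner loop, replacing the options/end/command assembly:
script = submit("slurm_job_main.sh %s %s %s" % cmd_line_args,
                job_name=job_name, time="45:00", memory=32000,
                backend="slurm")
script += " -c 1 --output %s.out" % job_name  # flags submit() does not cover
os.system(script)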
def dispatch(commands):
    scripts = {}
    for job_name, command in commands.items():
        script = submit(command, job_name=job_name, time="0",
                        memory=max_memory + "G", backend="slurm",
                        shell_script="#!/usr/bin/env bash")
        script += " --partition=bigmemm"
        scripts[job_name] = script

    scheduled_jobs = set(queued_or_running_jobs())
    for job_name, script in scripts.items():
        if job_name not in scheduled_jobs:
            sys.stdout.write("\n{}\n".format(script))
            subprocess.call(script, shell=True)
        else:
            sys.stderr.write("{} running, skipping.\n".format(job_name))
def dispatch(filelist, commands):
    scripts = []
    for i, command in enumerate(commands):
        script = submit(command,
                        job_name="samtools_sort_%s" % filelist[i],
                        time="0", memory=240000, backend="slurm",
                        shell_script="#!/usr/bin/env bash")
        script += " --cpus-per-task=12"
        script += " --ntasks=1"
        script += " --partition=bigmemm"
        scripts.append(script)

    scheduled_jobs = set(queued_or_running_jobs())
    for i, script in enumerate(scripts):
        if "samtools_sort_%s" % filelist[i] not in scheduled_jobs:
            sys.stdout.write("\n%s\n" % script)
            subprocess.call(script, shell=True)
        else:
            sys.stderr.write("Job name 'samtools_sort_%s' found in queued "
                             "or running jobs list\n" % filelist[i])
def test_log_output(n_trials=30):
    """Test that log output is uniform across schedulers."""
    with TemporaryDirectory() as temp_folder:
        user = getuser()
        job_completed = False

        # Launch a sleepy job
        job_name = 'ok_job'
        command = submit(job_command="echo ok", job_name=job_name,
                         time="700", memory=500, log_directory=temp_folder)
        job_id = _check_job_id(command)

        try:
            for _ in range(n_trials):
                if job_name not in queued_or_running_jobs(user=user):
                    # The job has completed; check its output
                    job_completed = True
                    filename = "%s.%s.txt" % (job_name, job_id)
                    assert_equal(os.listdir(temp_folder), [filename])
                    with open(op.join(temp_folder, filename)) as fhandle:
                        assert_equal(fhandle.read().strip(), "ok")
                    break
                else:
                    # Let's wait a bit before retrying
                    sleep(5)
        finally:
            # Make sure to clean up even if there is a failure
            if not job_completed:
                if _get_backend('auto') == 'slurm':
                    subprocess.call(["scancel", job_id])
                else:
                    subprocess.call(["qdel", job_id])
                raise AssertionError(
                    "job %s (%s) has not completed after 5min."
                    % (job_id, job_name))
def refresh(self):
    job_dict = load_notifications(self.exp_name)

    # Re-label jobs recorded as running/pending but no longer known
    # to the scheduler
    queued = frozenset(queued_or_running_jobs(self.user))
    r_jobs = _filter(job_dict, __RUNNING__)
    p_jobs = _filter(job_dict, __PENDING__)
    i_jobs = _filter(job_dict, __PARTIAL__)

    launchables = {k for k in r_jobs if k not in queued}
    launchables.update(k for k in p_jobs if k not in queued)
    launchable_jobs_update(self.exp_name, launchables)

    incompletes = {k for k in i_jobs if k not in queued}
    incomplete_jobs_update(self.exp_name, incompletes)

    # Apply the same changes locally
    for comp_name in launchables:
        job_dict[comp_name][__STATE__] = __LAUNCHABLE__
    for comp_name in incompletes:
        job_dict[comp_name][__STATE__] = __INCOMPLETE__

    # Store the refreshed view
    self.job_dict = job_dict
    self.state_dict = _sort_by_state(self.job_dict)
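`_filter` is not shown in this snippet; one plausible shape, assuming each job record is a dict carrying its state under the `__STATE__` key (a sketch, not the project's actual helper):

def _filter(job_dict, state):
    # Keep only the jobs whose recorded state matches `state`.
    return {name: info for name, info in job_dict.items()
            if info[__STATE__] == state}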
# clusterlib_launcher.py
import sys

from clusterlib.scheduler import queued_or_running_jobs
from clusterlib.scheduler import submit
from clusterlib.storage import sqlite3_loads

from clusterlib_main import NOSQL_PATH

if __name__ == "__main__":
    scheduled_jobs = set(queued_or_running_jobs())
    done_jobs = sqlite3_loads(NOSQL_PATH)

    for param in range(100):
        job_name = "job-param=%s" % param
        job_command = "%s clusterlib_main.py --param %s" % (sys.executable,
                                                            param)
        if job_name not in scheduled_jobs and job_command not in done_jobs:
            script = submit(job_command, job_name=job_name)
            print(script)

            # Uncomment those lines to launch the jobs
            # import os
            # os.system(script)
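For context, the companion script imported above might look like the sketch below, following the launcher/main pattern from clusterlib's documentation: it does the work for one parameter, then records completion in the NO-SQL store so the launcher can skip it on the next pass. The store path and argument handling are assumptions beyond what the launcher shows:

# clusterlib_main.py (sketch)
import os
import sys
import argparse

from clusterlib.storage import sqlite3_dumps

NOSQL_PATH = os.path.join(os.environ["HOME"], "job.sqlite3")  # assumed location

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--param', type=int, required=True)
    args = parser.parse_args()

    # ... do the actual work for this parameter ...

    # Mark the job as done, keyed by its full command line, so that the
    # launcher's `job_command not in done_jobs` check will skip it.
    sqlite3_dumps({" ".join(sys.argv): "JOB DONE"}, NOSQL_PATH)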
# Argument parser
parser = argparse.ArgumentParser()
parser.add_argument('-d', '--debug', default=False, action="store_true")
parser.add_argument('-v', '--verbose', default=False, action="store_true")
parser.add_argument('-s', '--scores', default=False, action="store_true",
                    help="compute scores")
args = vars(parser.parse_args())

# Create the log directory if needed
if not os.path.exists(LOG_DIRECTORY):
    os.makedirs(LOG_DIRECTORY)

# Get running and completed jobs
all_jobs_running = set(queued_or_running_jobs())
all_jobs_done = sqlite3_loads(get_sqlite3_path())

# Initialize some counters for reporting
n_jobs_running = 0
n_jobs_done = 0
n_jobs_launched = 0
results = []

# Launch experiments where necessary
for parameters in PARAMETER_GRID:
    job_hash = make_hash(parameters)
    if job_hash in all_jobs_running:
        n_jobs_running += 1
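    # (Sketch) The excerpt stops mid-loop; one plausible continuation is
    # below. build_job_command, the submit call, and the bookkeeping are
    # assumptions, not the original author's code:
    elif job_hash in all_jobs_done:
        n_jobs_done += 1
    else:
        script = submit(build_job_command(parameters), job_name=job_hash,
                        log_directory=LOG_DIRECTORY)
        subprocess.call(script, shell=True)
        n_jobs_launched += 1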