def pcocc_setup(action, jobid, nolock, force):
    """Run one node-level setup action: 'init', 'cleanup', 'create' or 'delete'.

    Args:
        action: Name of the setup step to run. A value other than the four
            listed above performs no step.
        jobid: Job identifier forwarded to ``config.load`` for 'delete'.
        nolock: When true, the node lock is neither taken nor released.
        force: Forwarded to ``config.batch.delete_resources`` for 'delete'.

    Raises:
        click.UsageError: If ``jobid`` or ``force`` is set while ``action``
            is not 'delete'.
    """
    # Dont load user config, we run as a privileged user
    config = Config()

    # Reject invalid option combinations before taking the node lock, so a
    # usage error never leaves the lock held.
    if action != 'delete' and (jobid or force):
        raise click.UsageError('this option can only be used with delete')

    if not nolock:
        config.lock_node()

    try:
        # Always raise verbosity for setup processes
        config.verbose = max(config.verbose, 1)

        if action == 'init':
            config.load(process_type=ProcessType.OTHER)
            config.batch.init_node()
            config.config_node()
        elif action == 'cleanup':
            config.load(process_type=ProcessType.OTHER)
            config.cleanup_node()
        elif action == 'create':
            config.load(process_type=ProcessType.SETUP)
            config.tracker.reclaim(config.batch.list_all_jobs())
            config.batch.create_resources()
            cluster = Cluster(config.batch.cluster_definition,
                              resource_only=True)
            cluster.alloc_node_resources()
        elif action == 'delete':
            config.load(jobid=jobid, process_type=ProcessType.SETUP)
            config.tracker.cleanup_ref(config.batch.batchid)
            config.batch.delete_resources(force)
            cluster = Cluster(config.batch.cluster_definition,
                              resource_only=True)
            cluster.free_node_resources()
    finally:
        # Release the lock even when a step above raised.
        if not nolock:
            config.release_node()
def pcocc_alloc(restart_ckpt, alloc_script, batch_options, cluster_definition):
    """Request an allocation from the batch manager and run the launcher in it.

    Args:
        restart_ckpt: Checkpoint argument, converted by ``gen_ckpt_opt``.
        alloc_script: Script argument, converted by ``gen_alloc_script_opt``.
        batch_options: Iterable of extra options placed first in the
            allocation option list.
        cluster_definition: Cluster definition string; also passed to the
            launcher command line.

    Exits the process with the value returned by ``config.batch.alloc``.
    A ``PcoccError`` is passed to ``handle_error``.
    """
    try:
        config = load_config(process_type=ProcessType.OTHER)

        cluster_definition = ascii(cluster_definition)
        cluster = Cluster(cluster_definition)

        user_opts = list(batch_options)
        restart_args = gen_ckpt_opt(restart_ckpt)
        script_args = gen_alloc_script_opt(alloc_script)

        # Options for the allocation request: user options, licenses, then
        # one task per VM.
        alloc_opts = user_opts + get_license_opts(cluster)
        alloc_opts = alloc_opts + ['-n', '%d' % (len(cluster.vms))]

        # Command executed inside the allocation.
        launcher_cmd = ['pcocc'] + build_verbose_opt()
        launcher_cmd = launcher_cmd + ['internal', 'launcher',
                                       cluster_definition]
        launcher_cmd = launcher_cmd + script_args + restart_args

        exit_code = config.batch.alloc(cluster, alloc_opts, launcher_cmd)
        sys.exit(exit_code)
    except PcoccError as err:
        handle_error(err)
def pcocc_launcher(restart_ckpt, wait, script, alloc_script, cluster_definition):
    """Start the cluster job and a user process, then supervise both.

    Args:
        restart_ckpt: When set, passed as ``-r <restart_ckpt>`` to the
            internal run command; ``script`` is then not passed to
            ``pcocc exec``.
        wait: When false, the cluster is terminated once the user process
            exits. When true, only the end of the cluster job ends the loop.
        script: When set, the user process is ``pcocc exec`` (with
            ``-s script`` unless restarting from a checkpoint).
        alloc_script: Command line split with ``shlex`` and started either
            next to ``pcocc exec`` or as the user process itself.
        cluster_definition: Cluster definition string, stored under the
            ``cluster/user`` / ``definition`` key once hosts are configured.

    With neither ``script`` nor ``alloc_script``, the user process is the
    shell named by ``$SHELL`` (``bash`` if unset). The function ends through
    ``sys.exit(status >> 8)``.
    """
    config = load_config(process_type=ProcessType.LAUNCHER)
    batch = config.batch

    cluster_definition = ascii(cluster_definition)
    cluster = Cluster(cluster_definition)

    batch.populate_env()

    ckpt_opt = ['-r', restart_ckpt] if restart_ckpt else []

    # TODO: provide a way for the user to plugin his own pre-run scripts here
    os.mkdir(os.path.join(batch.cluster_state_dir, 'slurm'))
    for entry in os.listdir(helperdir):
        helper = os.path.abspath(os.path.join(helperdir, entry))
        # Only regular, executable files are run as helpers.
        if not os.path.isfile(helper):
            continue
        if not os.access(helper, os.X_OK):
            continue
        subprocess.call(helper, cwd=batch.cluster_state_dir)

    # TODO: This cmdline should be tunable
    run_cmd = ['pcocc'] + build_verbose_opt() + ['internal', 'run'] + ckpt_opt
    cluster_job = batch.run(cluster, ['-Q', '-X', '--resv-port'], run_cmd)

    try:
        cluster.wait_host_config()
    except PcoccError as err:
        cluster_job.kill()
        handle_error(err)
    except KeyboardInterrupt:
        cluster_job.kill()
        handle_error(PcoccError('Cluster launch was interrupted'))

    batch.write_key("cluster/user", "definition", cluster_definition)

    term_sigfd = fake_signalfd([signal.SIGTERM, signal.SIGINT])

    watched_pids = [cluster_job.pid]

    if script:
        exec_cmd = ["pcocc", "exec"]
        if not restart_ckpt:
            exec_cmd = exec_cmd + ["-s", script]
        user_proc = subprocess.Popen(exec_cmd)

        if alloc_script:
            # Runs alongside "pcocc exec"; watched, but its exit does not
            # match either branch of the loop below.
            alloc_proc = subprocess.Popen(shlex.split(alloc_script))
            watched_pids.append(alloc_proc.pid)
    elif alloc_script:
        user_proc = subprocess.Popen(shlex.split(alloc_script))
    else:
        # Interactive shell. The prompt command is set on the launcher's own
        # environment, which is then handed to the shell.
        os.environ['PROMPT_COMMAND'] = 'echo -n "(pcocc/%d) "' % (batch.batchid)
        shell = os.getenv('SHELL', default='bash')
        user_proc = subprocess.Popen(shell, env=os.environ)

    watched_pids.append(user_proc.pid)

    while True:
        status, pid, _ = wait_or_term_child(watched_pids, signal.SIGTERM,
                                            term_sigfd, 40)

        if pid == cluster_job.pid:
            if status == 0:
                sys.stderr.write("The cluster has shut down\n")
            else:
                sys.stderr.write("The cluster terminated unexpectedly\n")

            # This is racy but helps
            if user_proc.poll() is None:
                user_proc.terminate()
                time.sleep(1)
            if user_proc.poll() is None:
                user_proc.kill()

            sys.exit(status >> 8)

        if pid == user_proc.pid and not wait:
            sys.stderr.write("Terminating the cluster...\n")
            # Timer calls wait_timeout(cluster_job) if the job has not ended
            # within 40 seconds of the SIGINT.
            watchdog = threading.Timer(40, wait_timeout, [cluster_job])
            watchdog.start()
            cluster_job.send_signal(signal.SIGINT)
            cluster_job.wait()
            watchdog.cancel()
            sys.exit(status >> 8)
def pcocc_batch(restart_ckpt, batch_script, host_script, batch_options,
                cluster_definition):
    """Generate a bash wrapper script and submit it to the batch manager.

    The wrapper carries ``#SBATCH`` output directives, optionally embeds the
    batch and host scripts as here-documents written to files under ``/tmp``,
    runs ``pcocc internal launcher`` in the background, waits for it and
    removes the embedded script files.

    Args:
        restart_ckpt: Checkpoint argument, converted by ``gen_ckpt_opt``.
        batch_script: Readable file object or a false value. Its content is
            embedded and passed to the launcher with ``-s``.
        host_script: Readable file object or a false value. Its content is
            embedded and passed to the launcher with ``-E``.
        batch_options: Iterable of extra options placed first in the
            submission option list.
        cluster_definition: Cluster definition string, appended to the
            launcher command line in the wrapper.

    Exits the process with the value returned by ``config.batch.batch``.
    A ``PcoccError`` is passed to ``handle_error``.
    """
    try:
        config = load_config(process_type=ProcessType.OTHER)

        cluster_definition = ascii(cluster_definition)
        cluster = Cluster(cluster_definition)

        batch_options = list(batch_options)
        ckpt_opt = gen_ckpt_opt(restart_ckpt)

        (wrpfd, wrpname) = tempfile.mkstemp()

        # The context manager closes the wrapper file even if a write or a
        # read of the user scripts raises.
        with os.fdopen(wrpfd, 'w') as wrpfile:
            # With no script to run, the launcher gets '-w'.
            if batch_script or host_script:
                launcher_opt = []
            else:
                launcher_opt = ['-w']

            wrpfile.write('#!/bin/bash\n'
                          '#SBATCH -o pcocc_%j.out\n'
                          '#SBATCH -e pcocc_%j.err\n')

            if batch_script:
                launcher_opt += ['-s', '"$TEMP_BATCH_SCRIPT"']
                wrpfile.write(
                    '\n'
                    'TEMP_BATCH_SCRIPT="/tmp/pcocc.batch.$$"\n'
                    'cat <<"PCOCC_BATCH_SCRIPT_EOF" >> "${TEMP_BATCH_SCRIPT}"\n')
                wrpfile.write(batch_script.read())
                wrpfile.write(
                    '\n'
                    'PCOCC_BATCH_SCRIPT_EOF\n'
                    'chmod u+x "$TEMP_BATCH_SCRIPT"\n')

            if host_script:
                launcher_opt += ['-E', '"$TEMP_HOST_SCRIPT"']
                wrpfile.write(
                    '\n'
                    'TEMP_HOST_SCRIPT="/tmp/pcocc.host.$$"\n'
                    'cat <<"PCOCC_HOST_SCRIPT_EOF" >> "${TEMP_HOST_SCRIPT}"\n')
                wrpfile.write(host_script.read())
                wrpfile.write(
                    '\n'
                    'PCOCC_HOST_SCRIPT_EOF\n'
                    'chmod u+x "$TEMP_HOST_SCRIPT"\n')

            launcher_template = (
                '\n'
                'PYTHONUNBUFFERED=true pcocc %s internal launcher %s %s %s &\n'
                'wait\n'
                'rm "$TEMP_BATCH_SCRIPT" 2>/dev/null\n'
                'rm "$TEMP_HOST_SCRIPT" 2>/dev/null\n')
            wrpfile.write(launcher_template % (' '.join(build_verbose_opt()),
                                               ' '.join(launcher_opt),
                                               ' '.join(ckpt_opt),
                                               cluster_definition))

        # NOTE(review): the wrapper file at wrpname is never removed here;
        # confirm whether the batch manager still needs it after submission.
        ret = config.batch.batch(
            cluster,
            batch_options +
            get_license_opts(cluster) +
            ['-n', '%d' % (len(cluster.vms))],
            wrpname)
        sys.exit(ret)
    except PcoccError as err:
        handle_error(err)
def load_batch_cluster():
    """Build a ``Cluster`` from the definition stored by the batch manager.

    The definition is read from the ``cluster/user`` / ``definition`` key
    with ``blocking=True``.
    """
    batch = Config().batch
    stored_definition = batch.read_key('cluster/user', 'definition',
                                       blocking=True)
    return Cluster(stored_definition)