def use_dill(self):
    """Expand serialization support with dill

    adds support for closures, etc.

    This calls IPython.utils.pickleutil.use_dill() here and on each engine.
    """
    pickleutil.use_dill()
    return self.apply(pickleutil.use_dill)
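# Hedged usage sketch for use_dill(): the method above mirrors the one exposed on
# IPython.parallel views, so a minimal way to exercise it is to connect to an
# already-running cluster and enable dill before shipping closures. The connection
# details below are assumptions, not part of this repository.
def _example_use_dill():
    from IPython.parallel import Client

    rc = Client()              # connect to the default running cluster
    dview = rc[:]              # DirectView over all engines
    dview.use_dill()           # register dill locally and on every engine

    def make_adder(n):
        return lambda x: x + n  # a closure; plain pickle cannot ship this

    # With dill enabled the closure serializes and runs on the engines.
    return dview.map_sync(make_adder(5), range(4))  # -> [5, 6, 7, 8]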
def __init__(self, scheduler, queue, num_jobs, cores_per_job=1, profile=None,
             start_wait=16, extra_params=None, retries=None, direct=False):
    self.stopped = False
    self.profile = profile
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    _create_base_ipython_dirs()
    if self.profile is None:
        self.has_throwaway = True
        self.profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(self.profile)
        subprocess.check_call(cmd)
        self.has_throwaway = False
    num_tries = 0

    self.cluster_id = str(uuid.uuid4())
    url_file = get_url_file(self.profile, self.cluster_id)
    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, self.profile, self.cluster_id)
            else:
                _start(scheduler, self.profile, queue, num_jobs, cores_per_job,
                       self.cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        self.client = None
        need_engines = 1  # Start using cluster when this many engines are up
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file)
            if up < max_up:
                print("Engine(s) that were up have shut down prematurely. "
                      "Aborting cluster startup.")
                _stop(self.profile, self.cluster_id)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("""

The cluster startup timed out. This could be for a couple of reasons.

The most common reason is that the queue you are submitting jobs to is
oversubscribed. You can check if this is what is happening by trying again and
watching to see if the jobs are in a pending state or a running state when the
startup times out. If they are in the pending state, that means we just need to
wait longer for them to start, which you can specify by passing the --timeout
parameter, in minutes.

The second reason is that there is a problem with the controller and engine
jobs being submitted to the scheduler. In the directory you ran from, you
should see files named YourScheduler_enginesABunchOfNumbers and
YourScheduler_controllerABunchOfNumbers. If you submit one of those files
manually to your scheduler (for example, bsub < YourScheduler_controllerABunchOfNumbers)
you will get a more helpful error message that might help you figure out what
is going wrong.

The third reason is that you need to submit your bcbio_nextgen.py job itself as
a job; bcbio-nextgen needs to run on a compute node, not the login node. So the
command you use to run bcbio-nextgen should be submitted as a job to the
scheduler. You can diagnose this because the controller and engine jobs will be
in the running state, but the cluster will still time out.

Finally, it may be an issue with how the cluster is configured: the controller
and engine jobs are unable to talk to each other. They need to be able to open
ports on the machines each of them is running on in order to work. You can
diagnose this as the possible issue if you have submitted the bcbio-nextgen job
to the scheduler, the bcbio-nextgen main job and the controller and engine jobs
are all in a running state, and the cluster still times out.
This will likely be something that you'll have to talk to the administrators of
the cluster you are using about.

If you need help debugging, please post an issue here and we'll try to help you
with the detective work:

https://github.com/roryk/ipython-cluster-helper/issues

""")
        self.client = Client(url_file, timeout=60)
        if direct:
            self.view = _get_direct_view(self.client, retries)
        else:
            self.view = _get_balanced_blocked_view(self.client, retries)
        self.view.clusterhelper = {"profile": self.profile,
                                   "cluster_id": self.cluster_id}
        if dill:
            pickleutil.use_dill()
            self.view.apply(pickleutil.use_dill)
    except:
        self.stop()
        raise
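# Hedged usage sketch for the constructor above. It reads like the object-style
# counterpart of cluster_view() in ipython-cluster-helper; the class name
# ClusterView, its import path, and the stop() method are assumptions inferred
# from how self.view, self.client, and self.stop are used in __init__.
def _example_cluster_view_object():
    from cluster_helper.cluster import ClusterView  # assumed import path

    cv = ClusterView(scheduler="sge", queue="all.q", num_jobs=2,
                     cores_per_job=1, start_wait=30)
    try:
        # cv.view is the balanced (or direct) blocking view set up in __init__
        return cv.view.map(abs, range(-4, 4))
    finally:
        cv.stop()  # assumed cleanup method: shuts down engines and controller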
def cluster_view(queue, num_jobs, sshhostname=None, sshuser=None, sshport=22,
                 sshkey=None, sshpassword=None, executable=sys.executable,
                 profile_in_work_dir=False, cluster='Bcbio', work_dir=None,
                 scheduler='sge', cores_per_job=1, profile=None, start_wait=16,
                 extra_params=None, retries=None, direct=False):
    """Provide a view on an ipython cluster for processing.

    - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
    - num_jobs: Number of jobs to start.
    - cores_per_job: The number of cores to use for each job.
    - start_wait: How long to wait for the cluster to start up, in minutes.
      Defaults to 16 minutes. Set it longer for slow-starting clusters.
    - retries: Number of retries to allow for failed tasks.
    - sshhostname/sshuser/sshport/sshkey/sshpassword: details for the paramiko
      SSH connection used when driving a remote cluster.
    - executable: the path to the Python executable to use.
    """
    if sshhostname is not None:
        sshserver = sshhostname
        if sshuser is not None:
            sshserver = "{}@{}".format(sshuser, sshhostname)
        if sshport is not None:
            sshserver = "{}:{}".format(sshserver, sshport)
        ssh_client = SSHClient()
        ssh_client.load_system_host_keys()
        ssh_client.connect(hostname=sshhostname, port=sshport, username=sshuser,
                           key_filename=sshkey)
    else:
        # run everything locally; later branches check whether ssh_client is None
        sshserver = None
        ssh_client = None
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 15
    max_tries = 10

    if profile is None:
        has_throwaway = True
        profile, profile_dir = create_throwaway_profile(
            executable, ssh_client, profile_in_work_dir, work_dir)
        print("Created profile {}".format(profile))
        sys.stdout.flush()
    else:
        if ssh_client is None:
            if os.path.isdir(profile) and os.path.isabs(profile):
                # use the full path if one is given
                profile_dir = profile
            else:
                profile_dir = locate_profile(profile)
        else:
            remote_cmd = (
                '{0} -E -c '
                '"from IPython.utils.path import locate_profile; '
                'import os; '
                'print(\'{1}\' if os.path.isdir(\'{1}\') and os.path.isabs(\'{1}\') '
                'else locate_profile(\'{1}\'))"').format(executable, profile)
            profile_dir = ssh_client.exec_command(
                'source ~/.profile > /dev/null; ' + remote_cmd
            )[1].read().decode('utf-8').strip()
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(profile,
                                                                      profile_dir)
        if ssh_client is None:
            subprocess.check_call(cmd)
        else:
            remote_cmd = ' '.join(['"{}"'.format(item) if ' ' in item else item
                                   for item in cmd])
            ssh_client.exec_command('source ~/.profile; ' + remote_cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    print("Cluster profile: {}".format(profile))
    print("Cluster profile directory: {}".format(profile_dir))
    print("Cluster ID: {}".format(cluster_id))
    sys.stdout.flush()
    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, profile, profile_dir, cluster_id)
            else:
                _start(scheduler, profile, profile_dir, queue, num_jobs,
                       cores_per_job, cluster_id, extra_params, executable,
                       ssh_client, cluster=cluster, work_dir=work_dir)
            print("Cluster started.")
            sys.stdout.flush()
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
            print("Retrying cluster startup...")
            sys.stdout.flush()
    client = None
    try:
        url_file = get_url_file(profile, profile_dir, cluster_id, executable,
                                ssh_client=ssh_client, timeout=start_wait * 60)
        print("URL file: {}".format(url_file))
        sys.stdout.flush()
        need_engines = 1  # Start using cluster when this many engines are up
        client = None
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file, sshserver=sshserver, sshkey=sshkey,
                              sshpassword=sshpassword)
            print("{} engines up.".format(up))
            sys.stdout.flush()
            if up < max_up:
                print("Engine(s) that were up have shut down prematurely. "
                      "Aborting cluster startup.")
                _stop(profile, profile_dir, cluster_id, executable, ssh_client)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60, sshserver=sshserver,
                        sshkey=sshkey, sshpassword=sshpassword)
        if direct:
            view = _get_direct_view(client, retries)
        else:
            view = _get_balanced_blocked_view(client, retries)
        view.clusterhelper = {"profile": profile,
                              "cluster_id": cluster_id,
                              "client": client}
        if dill:
            pickleutil.use_dill()
            view.apply(pickleutil.use_dill)
        yield view
    finally:
        if client:
            _shutdown(client)
        _stop(profile, profile_dir, cluster_id, executable, ssh_client)
        if has_throwaway:
            delete_profile(profile_dir, cluster_id, ssh_client)
        if ssh_client is not None:  # only opened when sshhostname was given
            ssh_client.close()
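# Hedged usage sketch for the SSH-enabled cluster_view() above. Because it
# yields a view and cleans up in a finally block, it is presumably wrapped with
# contextlib.contextmanager, as in ipython-cluster-helper. The host, user, key
# path, and work_dir below are placeholders, not values from this repository.
def _example_remote_cluster_view():
    with cluster_view(queue="all.q", num_jobs=4, scheduler="sge",
                      sshhostname="login.example.org", sshuser="alice",
                      sshkey="/home/alice/.ssh/id_rsa",
                      work_dir="/scratch/alice/run1") as view:
        # The yielded view is a blocking load-balanced view over the engines.
        return view.map(abs, [-1, 2, -3])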
import logging
from collections import namedtuple, defaultdict

from .qt import *

from IPython.qt.base_frontend_mixin import BaseFrontendMixin
from IPython.qt.inprocess import QtInProcessKernelManager as KernelManager
from IPython.qt.console.ansi_code_processor import QtAnsiCodeProcessor
from IPython.parallel import Client, TimeoutError, RemoteError
from IPython.utils.pickleutil import use_dill
use_dill()

from datetime import datetime
import re
import os
import sys
from subprocess import Popen
from IPython.parallel.apps import ipclusterapp
from matplotlib import rcParams

# Kernel is busy but not because of us
STATUS_BLOCKED = -1
# Normal statuses
STATUS_READY = 0
STATUS_RUNNING = 1
STATUS_COMPLETE = 2
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None, direct=False):
    """Provide a view on an ipython cluster for processing.

    - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
    - num_jobs: Number of jobs to start.
    - cores_per_job: The number of cores to use for each job.
    - start_wait: How long to wait for the cluster to start up, in minutes.
      Defaults to 16 minutes. Set it longer for slow-starting clusters.
    - retries: Number of retries to allow for failed tasks.
    """
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(profile)
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)
    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, profile, cluster_id)
            else:
                _start(scheduler, profile, queue, num_jobs, cores_per_job,
                       cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        need_engines = 1  # Start using cluster when this many engines are up
        client = None
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file)
            if up < max_up:
                print("Engine(s) that were up have shut down prematurely. "
                      "Aborting cluster startup.")
                _stop(profile, cluster_id)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        if direct:
            view = _get_direct_view(client, retries)
        else:
            view = _get_balanced_blocked_view(client, retries)
        if dill:
            pickleutil.use_dill()
            view.apply(pickleutil.use_dill)
        yield view
    finally:
        if client:
            _shutdown(client)
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None, direct=False):
    """Provide a view on an ipython cluster for processing.

    - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
    - num_jobs: Number of jobs to start.
    - cores_per_job: The number of cores to use for each job.
    - start_wait: How long to wait for the cluster to start up, in minutes.
      Defaults to 16 minutes. Set it longer for slow-starting clusters.
    - retries: Number of retries to allow for failed tasks.
    """
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(profile)
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)
    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, profile, cluster_id)
            else:
                _start(scheduler, profile, queue, num_jobs, cores_per_job,
                       cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        need_engines = 1  # Start using cluster when this many engines are up
        client = None
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file)
            if up < max_up:
                print("Engine(s) that were up have shut down prematurely. "
                      "Aborting cluster startup.")
                _stop(profile, cluster_id)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        if direct:
            view = _get_direct_view(client, retries)
        else:
            view = _get_balanced_blocked_view(client, retries)
        view.clusterhelper = {"profile": profile,
                              "cluster_id": cluster_id}
        if dill:
            pickleutil.use_dill()
            view.apply(pickleutil.use_dill)
        yield view
    finally:
        if client:
            _shutdown(client)
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
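# Hedged usage sketch for cluster_view() above, assuming it is decorated with
# contextlib.contextmanager as in ipython-cluster-helper. The run_local flag is
# taken from the code itself (it bypasses the scheduler entirely), so scheduler
# and queue can be None here; with a real scheduler you would pass, for example,
# scheduler="sge" and queue="all.q" instead.
def _example_cluster_view_local():
    def square(x):
        return x * x

    with cluster_view(scheduler=None, queue=None, num_jobs=2,
                      extra_params={"run_local": True}) as view:
        # The yielded view blocks on map() until all results are back.
        return view.map(square, range(16))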