Example #1
    def use_dill(self):
        """Expand serialization support with dill.

        Adds support for closures, etc.

        This calls IPython.utils.pickleutil.use_dill() here and on each engine.
        """
        pickleutil.use_dill()
        return self.apply(pickleutil.use_dill)
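
The method above enables dill locally and then, via self.apply, on every engine. A minimal usage sketch, assuming this is the use_dill method of IPython.parallel's Client and that a cluster is already running under the default profile; the profile and lambda values are illustrative only:

# Hedged sketch: assumes an IPython.parallel cluster is already up.
from IPython.parallel import Client

client = Client()   # connect using the default profile
client.use_dill()   # enable dill locally and on each engine

# Closures now serialize; plain pickle cannot handle the captured `n`.
n = 3
view = client.load_balanced_view()
print(view.map_sync(lambda x: x + n, range(5)))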
Example #2
    def __init__(self, scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None, direct=False):
        self.stopped = False
        self.profile = profile
        num_jobs = int(num_jobs)
        cores_per_job = int(cores_per_job)
        start_wait = int(start_wait)

        if extra_params is None:
            extra_params = {}
        max_delay = start_wait * 60
        delay = 5 if extra_params.get("run_local") else 30
        max_tries = 10
        _create_base_ipython_dirs()
        if self.profile is None:
            self.has_throwaway = True
            self.profile = create_throwaway_profile()
        else:
            # ensure we have an .ipython directory to prevent issues
            # creating it during parallel startup
            cmd = [sys.executable, "-E", "-c", "from IPython import start_ipython; start_ipython()",
                   "profile", "create", "--parallel"] + _get_profile_args(self.profile)
            subprocess.check_call(cmd)
            self.has_throwaway = False
        num_tries = 0

        self.cluster_id = str(uuid.uuid4())
        url_file = get_url_file(self.profile, self.cluster_id)

        while 1:
            try:
                if extra_params.get("run_local"):
                    _start_local(num_jobs, self.profile, self.cluster_id)
                else:
                    _start(scheduler, self.profile, queue, num_jobs, cores_per_job, self.cluster_id, extra_params)
                break
            except subprocess.CalledProcessError:
                if num_tries > max_tries:
                    raise
                num_tries += 1
                time.sleep(delay)

        try:
            self.client = None
            need_engines = 1  # Start using cluster when this many engines are up
            slept = 0
            max_up = 0
            up = 0
            while up < need_engines:
                up = _nengines_up(url_file)
                if up < max_up:
                    print ("Engine(s) that were up have shutdown prematurely. "
                           "Aborting cluster startup.")
                    _stop(self.profile, self.cluster_id)
                    sys.exit(1)
                max_up = up
                time.sleep(delay)
                slept += delay
                if slept > max_delay:
                    raise IOError("""

        The cluster startup timed out. This could happen for a few reasons. The
        most common reason is that the queue you are submitting jobs to is
        oversubscribed. You can check if this is what is happening by trying again,
        and watching to see if jobs are in a pending state or a running state when
        the startup times out. If they are in the pending state, that means we just
        need to wait longer for them to start, which you can specify by passing
        the --timeout parameter, in minutes.

        The second reason is that there is a problem with the controller and engine
        jobs being submitted to the scheduler. In the directory you ran from,
        you should see files that are named YourScheduler_enginesABunchOfNumbers and
        YourScheduler_controllerABunchOfNumbers. If you submit one of those files
        manually to your scheduler (for example, bsub < YourScheduler_controllerABunchOfNumbers),
        you will get a more informative error message that might help you figure out
        what is going wrong.

        The third reason is that you need to submit your bcbio_nextgen.py job itself as a job;
        bcbio-nextgen needs to run on a compute node, not the login node. So the
        command you use to run bcbio-nextgen should be submitted as a job to
        the scheduler. You can diagnose this because the controller and engine
        jobs will be in the running state, but the cluster will still timeout.

        Finally, it may be an issue with how the cluster is configured: the controller
        and engine jobs are unable to talk to each other. They need to be able to open
        ports on the machines each of them is running on in order to work. You can
        diagnose this as the possible issue if you have submitted the bcbio-nextgen
        job to the scheduler, the bcbio-nextgen main job and the controller and engine
        jobs are all in a running state, and the cluster still times out. This will
        likely be something that you'll have to discuss with the administrators of the
        cluster you are using.

        If you need help debugging, please post an issue here and we'll try to help you
        with the detective work:

        https://github.com/roryk/ipython-cluster-helper/issues

                            """)
            self.client = Client(url_file, timeout=60)
            if direct:
                self.view = _get_direct_view(self.client, retries)
            else:
                self.view = _get_balanced_blocked_view(self.client, retries)
            self.view.clusterhelper = {"profile": self.profile,
                                       "cluster_id": self.cluster_id}
            if dill:
                pickleutil.use_dill()
                self.view.apply(pickleutil.use_dill)
        except:
            self.stop()
            raise
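
This __init__ belongs to a cluster-helper class whose name is not shown in the excerpt. Assuming it is the ClusterView class from ipython-cluster-helper's cluster_helper.cluster module (an assumption; adjust the import to your installation), a hedged construction sketch looks like this, with site-specific scheduler and queue placeholders:

# Hedged sketch: the import path and class name are assumptions based on
# ipython-cluster-helper.
from cluster_helper.cluster import ClusterView

cv = ClusterView(scheduler="sge", queue="all.q", num_jobs=4,
                 cores_per_job=1, start_wait=16)
try:
    # self.view is set during __init__, so work can be mapped directly.
    results = cv.view.map(lambda x: x ** 2, range(10))
finally:
    cv.stop()  # stop() exists: __init__ calls self.stop() on failure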
Example #3
def cluster_view(
    queue, num_jobs,
    sshhostname=None, sshuser=None, sshport=22, sshkey=None, sshpassword=None,
    executable=sys.executable,
    profile_in_work_dir=False,
    cluster='Bcbio', work_dir=None,
    scheduler='sge', cores_per_job=1, profile=None,
    start_wait=16, extra_params=None, retries=None, direct=False
):
    """Provide a view on an ipython cluster for processing.

      - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - start_wait: How long to wait for the cluster to startup, in minutes.
        Defaults to 16 minutes. Set to longer for slow starting clusters.
      - retries: Number of retries to allow for failed tasks.

      - ssh_client: A connected paramiko.client.SSHClient instance
      - executable: the path to the executable
    """

    # Initialize so the code below is well-defined when no SSH host is given.
    ssh_client = None
    sshserver = None
    if sshhostname is not None:
        sshserver = sshhostname
        if sshuser is not None:
            sshserver = "{}@{}".format(sshuser, sshhostname)
        if sshport is not None:
            sshserver = "{}:{}".format(sshserver, sshport)

        ssh_client = SSHClient()
        ssh_client.load_system_host_keys()
        ssh_client.connect(
            hostname=sshhostname,
            port=sshport,
            username=sshuser,
            key_filename=sshkey,
        )

    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 15
    max_tries = 10

    if profile is None:
        has_throwaway = True
        profile, profile_dir = create_throwaway_profile(
            executable, ssh_client, profile_in_work_dir, work_dir
        )
        print("Created profile {}".format(profile))
        sys.stdout.flush()
    else:
        if ssh_client is None:
            if os.path.isdir(profile) and os.path.isabs(profile):
                # Return full_path if one is given
                profile_dir = profile
            else:
                profile_dir = locate_profile(profile)
        else:
            remote_cmd = (
                '{0} -E -c '
                '"from IPython.utils.path import locate_profile; '
                'import os; '
                'print(\'{1}\' if os.path.isdir(\'{1}\') and os.path.isabs(\'{1}\') '
                'else locate_profile(\'{1}\'))"'
            ).format(executable, profile)

            profile_dir = ssh_client.exec_command(
                'source ~/.profile > /dev/null; ' + remote_cmd
            )[1].read().decode('utf-8').strip()

        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(
                   profile, profile_dir
               )
        if ssh_client is None:
            subprocess.check_call(cmd)
        else:
            remote_cmd = ' '.join([
                '"{}"'.format(item) if ' ' in item else item
                for item in cmd
            ])
            ssh_client.exec_command(
                'source ~/.profile; ' + remote_cmd
            )
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    print("Cluster profile: {}".format(profile))
    print("Cluster profile directory: {}".format(profile_dir))
    print("Cluster ID: {}".format(cluster_id))
    sys.stdout.flush()

    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, profile, profile_dir, cluster_id)
            else:
                _start(
                    scheduler, profile, profile_dir, queue, num_jobs,
                    cores_per_job, cluster_id, extra_params,
                    executable, ssh_client, cluster=cluster, work_dir=work_dir
                )
                print("Cluster started.")
                sys.stdout.flush()
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
            print("Retry to start cluster...")
            sys.stdout.flush()

    client = None
    try:
        url_file = get_url_file(
            profile, profile_dir, cluster_id, executable,
            ssh_client=ssh_client, timeout=start_wait * 60
        )
        print("URL file: {}".format(url_file))
        sys.stdout.flush()

        need_engines = 1  # Start using cluster when this many engines are up
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(
                url_file,
                sshserver=sshserver, sshkey=sshkey, sshpassword=sshpassword
            )
            print("{} engines up.".format(up))
            sys.stdout.flush()
            if up < max_up:
                print(
                    "Engine(s) that were up have shutdown prematurely. "
                    "Aborting cluster startup."
                )
                _stop(profile, profile_dir, cluster_id, executable, ssh_client)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(
            url_file, timeout=60,
            sshserver=sshserver, sshkey=sshkey, sshpassword=sshpassword
        )
        if direct:
            view = _get_direct_view(client, retries)
        else:
            view = _get_balanced_blocked_view(client, retries)
        view.clusterhelper = {
            "profile": profile,
            "cluster_id": cluster_id,
            "client": client,
        }
        if dill:
            pickleutil.use_dill()
            view.apply(pickleutil.use_dill)
        yield view
    finally:
        if client:
            _shutdown(client)
        _stop(profile, profile_dir, cluster_id, executable, ssh_client)
        if has_throwaway:
            delete_profile(profile_dir, cluster_id, ssh_client)
        if ssh_client is not None:
            ssh_client.close()
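
Because this variant yields the view and tears everything down in the finally block, it is meant to be consumed as a context manager (presumably via contextlib.contextmanager; the decorator is not shown, so that is an assumption). A hedged usage sketch with placeholder SSH details:

# Hedged sketch: host, user, and key path are placeholders; the import
# path and context-manager wrapping are assumed from the structure above.
from cluster_helper.cluster import cluster_view

with cluster_view(queue="general", num_jobs=8, scheduler="sge",
                  sshhostname="login.example.org", sshuser="alice",
                  sshkey="~/.ssh/id_rsa") as view:
    results = view.map(lambda x: x * 2, range(100))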
Example #4
import logging

from collections import namedtuple, defaultdict

from .qt import *

from IPython.qt.base_frontend_mixin import BaseFrontendMixin
from IPython.qt.inprocess import QtInProcessKernelManager as KernelManager
from IPython.qt.console.ansi_code_processor import QtAnsiCodeProcessor

from IPython.parallel import Client, TimeoutError, RemoteError
from IPython.utils.pickleutil import use_dill
use_dill()

from datetime import datetime
import re
import os
import sys
from subprocess import Popen
from IPython.parallel.apps import ipclusterapp

from matplotlib import rcParams

# Kernel is busy but not because of us
STATUS_BLOCKED = -1

# Normal statuses
STATUS_READY = 0
STATUS_RUNNING = 1
STATUS_COMPLETE = 2
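
The status constants form a small state machine for the frontend: STATUS_BLOCKED marks a kernel that is busy with someone else's work, while the others track our own request's lifecycle. A hedged sketch of how a handler might map kernel status messages onto them; the helper name and the `owned` flag are hypothetical:

# Hedged sketch: _map_status is a hypothetical helper; `owned` says whether
# the busy state was caused by our own execute request.
def _map_status(execution_state, owned):
    if execution_state == "busy":
        return STATUS_RUNNING if owned else STATUS_BLOCKED
    # "idle" (and anything unrecognized) falls back to ready.
    return STATUS_READY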
Example #5
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None, direct=False):
    """Provide a view on an ipython cluster for processing.

      - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - start_wait: How long to wait for the cluster to startup, in minutes.
        Defaults to 16 minutes. Set to longer for slow starting clusters.
      - retries: Number of retries to allow for failed tasks.
    """
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c", "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(profile)
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, profile, cluster_id)
            else:
                _start(scheduler, profile, queue, num_jobs, cores_per_job, cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        need_engines = 1  # Start using cluster when this many engines are up
        client = None
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file)
            if up < max_up:
                print ("Engine(s) that were up have shutdown prematurely. "
                       "Aborting cluster startup.")
                _stop(profile, cluster_id)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        if direct:
            view = _get_direct_view(client, retries)
        else:
            view = _get_balanced_blocked_view(client, retries)
        if dill:
            pickleutil.use_dill()
            view.apply(pickleutil.use_dill)
        yield view
    finally:
        if client:
            _shutdown(client)
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
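
This generator is consumed as a context manager: entering it blocks until at least one engine is up, and leaving it shuts the cluster down and deletes any throwaway profile. A hedged sketch exercising the run_local branch above; passing None for scheduler and queue is an assumption based on _start_local not receiving them:

# Hedged sketch: assumes cluster_view is wrapped with
# contextlib.contextmanager, as the yield/finally structure suggests.
with cluster_view(scheduler=None, queue=None, num_jobs=2,
                  extra_params={"run_local": True}) as view:
    print(view.map(lambda x: x + 1, range(4)))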
Example #6
def cluster_view(scheduler,
                 queue,
                 num_jobs,
                 cores_per_job=1,
                 profile=None,
                 start_wait=16,
                 extra_params=None,
                 retries=None,
                 direct=False):
    """Provide a view on an ipython cluster for processing.

      - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - start_wait: How long to wait for the cluster to startup, in minutes.
        Defaults to 16 minutes. Set to longer for slow starting clusters.
      - retries: Number of retries to allow for failed tasks.
    """
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [
            sys.executable, "-E", "-c",
            "from IPython import start_ipython; start_ipython()", "profile",
            "create", "--parallel"
        ] + _get_profile_args(profile)
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, profile, cluster_id)
            else:
                _start(scheduler, profile, queue, num_jobs, cores_per_job,
                       cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        need_engines = 1  # Start using cluster when this many engines are up
        client = None
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file)
            if up < max_up:
                print(
                    "Engine(s) that were up have shut down prematurely. "
                    "Aborting cluster startup.")
                _stop(profile, cluster_id)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        if direct:
            view = _get_direct_view(client, retries)
        else:
            view = _get_balanced_blocked_view(client, retries)
        view.clusterhelper = {"profile": profile, "cluster_id": cluster_id}
        if dill:
            pickleutil.use_dill()
            view.apply(pickleutil.use_dill)
        yield view
    finally:
        if client:
            _shutdown(client)
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
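
Compared with Example #5, this version also attaches the profile and cluster_id to the view via view.clusterhelper, so callers can see which cluster is serving them. The direct flag switches from a load-balanced view to a direct one; a hedged sketch, where the DirectView behavior is an assumption based on the _get_direct_view name:

# Hedged sketch: direct=True is assumed to hand back a DirectView that
# addresses all engines; retries bounds task resubmission on failure.
with cluster_view(scheduler="lsf", queue="short", num_jobs=4,
                  retries=3, direct=True) as view:
    print(view.clusterhelper["profile"], view.clusterhelper["cluster_id"])
    view.map(lambda x: x ** 2, range(16))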