예제 #1
0
def add_engines(n=1, profile='iptest', total=False):
    """Start engines for a profile and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engines to start; when ``total`` is true, the desired
        total number of engines instead.
    profile : str
        Profile name given to the Client and to each engine command line.
    total : bool
        If True, already running engines are counted, and only the
        additional engines necessary (if any) are started.

    Returns
    -------
    list
        The launchers started by this call (each one is also appended to
        the shared ``launchers`` list).

    Raises
    ------
    RuntimeError
        If any started launcher reports a non-None ``poll()`` result while
        waiting, or if the engines have not all connected after 15 seconds.
    """
    rc = Client(profile=profile)
    # Close the client on every exit path, including the RuntimeErrors
    # raised below, so its sockets are not left open.
    try:
        base = len(rc)

        if total:
            n = max(n - base, 0)

        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + ['--profile=%s' % profile, '--log-level=50']
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 15:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            # Let the client process pending messages so len(rc) can change.
            rc.spin()
    finally:
        rc.close()
    return eps
예제 #2
0
def add_engines(n=1, profile='iptest', total=False):
    """Start engines for a profile and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engines to start; when ``total`` is true, the desired
        total number of engines instead.
    profile : str
        Profile name given to the Client and to each engine command line.
    total : bool
        If True, already running engines are counted, and only the
        additional engines necessary (if any) are started.

    Returns
    -------
    list
        The launchers started by this call (each one is also appended to
        the shared ``launchers`` list).

    Raises
    ------
    RuntimeError
        If any started launcher reports a non-None ``poll()`` result while
        waiting, or if the engines have not all connected after 15 seconds.
    """
    rc = Client(profile=profile)
    # Close the client on every exit path, including the RuntimeErrors
    # raised below, so its sockets are not left open.
    try:
        base = len(rc)

        if total:
            n = max(n - base, 0)

        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                '--profile=%s' % profile, '--log-level=50',
                '--InteractiveShell.colors=nocolor'
            ]
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 15:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            # Let the client process pending messages so len(rc) can change.
            rc.spin()
    finally:
        rc.close()
    return eps
예제 #3
0
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None):
    """Provide a view on an ipython cluster for processing.

    This is a generator: it starts the cluster, waits until it is up,
    yields a view on it, and in its ``finally`` clause closes the client,
    stops the cluster and deletes the profile if it was a throwaway one.

      - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
      - queue: Passed through to the cluster start helper.
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - profile: Profile to use. When None, a throwaway profile is created
        and deleted again on exit.
      - start_wait: How long to wait for the cluster to startup, in minutes.
        Defaults to 16 minutes. Set to longer for slow starting clusters.
      - extra_params: Optional dict of extra settings. A true "run_local"
        entry starts a local cluster and shortens the polling delay.
      - retries: Number of retries to allow for failed tasks.

    Raises IOError when the cluster is not up within ``start_wait`` minutes,
    and re-raises subprocess.CalledProcessError once the number of failed
    start attempts exceeds the retry limit.
    """
    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c", "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(profile)
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    # Retry starting the cluster, sleeping between failed attempts.
    while True:
        try:
            if extra_params.get("run_local"):
                _start_local(cores_per_job, profile, cluster_id)
            else:
                _start(scheduler, profile, queue, num_jobs, cores_per_job, cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(url_file, num_jobs):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        yield _get_balanced_blocked_view(client, retries)
    finally:
        # Compare against None explicitly: a plain truth test on a sized
        # object is False when its length is zero, which would skip close().
        if client is not None:
            client.close()
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
예제 #4
0
def _is_up(profile, cluster_id, n):
    """Report whether at least ``n`` engines are visible for ``profile``.

    ``cluster_id`` is accepted for interface compatibility but is not passed
    to the Client (the call that used it was disabled in the original).

    Returns False when connecting or querying raises IOError; otherwise
    returns True if the number of engine ids is at least ``n``.
    """
    try:
        client = Client(profile=profile)
        try:
            up = len(client.ids)
        finally:
            # Always release the connection, even if the query fails.
            client.close()
    except IOError:
        return False
    # The original fell off the end here and returned None (falsy) even when
    # the engines were up, so callers polling on this never saw success.
    return up >= n
예제 #5
0
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None):
    """Provide a view on an ipython cluster for processing.

    This is a generator: it starts the cluster, waits until it is up,
    yields a view on it, and in its ``finally`` clause closes the client,
    stops the cluster and deletes the profile if it was a throwaway one.

      - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
      - queue: Passed through to the cluster start helper.
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - profile: Profile to use. When None, a throwaway profile is created
        and deleted again on exit.
      - start_wait: How long to wait for the cluster to startup, in minutes.
        Defaults to 16 minutes. Set to longer for slow starting clusters.
      - extra_params: Optional dict of extra settings, passed through to the
        cluster start helper.
      - retries: Number of retries to allow for failed tasks.

    Raises IOError when the cluster is not up within ``start_wait`` minutes,
    and re-raises subprocess.CalledProcessError once the number of failed
    start attempts exceeds the retry limit.
    """
    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    # Polling/retry delay in seconds: three times the former 10 second
    # default, with max_delay left unchanged for back compatibility.
    delay = 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-c", "from IPython import start_ipython; start_ipython()",
               "profile", "create"]
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    # Retry starting the cluster, sleeping between failed attempts.
    while True:
        try:
            _start(scheduler, profile, queue, num_jobs, cores_per_job, cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(url_file, num_jobs):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        yield _get_balanced_blocked_view(client, retries)
    finally:
        # Compare against None explicitly: a plain truth test on a sized
        # object is False when its length is zero, which would skip close().
        if client is not None:
            client.close()
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
예제 #6
0
def _is_up(url_file, n):
    """Report whether at least ``n`` engines are connected.

    Connects a Client using ``url_file`` and counts its engine ids.
    Returns False when connecting or querying raises IOError; otherwise
    returns True if the count is at least ``n``.
    """
    try:
        client = Client(url_file)
        try:
            up = len(client.ids)
        finally:
            # Always release the connection, even if the query fails.
            client.close()
    except IOError:
        return False
    return up >= n
예제 #7
0
def _is_up(url_file, n):
    """Report whether at least ``n`` engines are connected.

    Connects a Client using ``url_file`` (60 second timeout) and counts its
    engine ids. Returns False when connecting or querying raises
    iperror.TimeoutError or IOError; otherwise returns True if the count is
    at least ``n``.
    """
    try:
        client = Client(url_file, timeout=60)
        try:
            up = len(client.ids)
        finally:
            # Always release the connection, even if the query fails.
            client.close()
    except iperror.TimeoutError:
        return False
    except IOError:
        return False
    return up >= n
def _is_up(url_file, n):
    """Report whether at least ``n`` engines are connected.

    Connects a Client using ``url_file`` (60 second timeout) and counts its
    engine ids. Returns False when connecting or querying raises
    iperror.TimeoutError or IOError; otherwise returns True if the count is
    at least ``n``.
    """
    try:
        client = Client(url_file, timeout=60)
        try:
            up = len(client.ids)
        finally:
            # Always release the connection, even if the query fails.
            client.close()
    except iperror.TimeoutError:
        return False
    except IOError:
        return False
    return up >= n
예제 #9
0
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None):
    """Provide a view on an ipython cluster for processing.

    This is a generator: it starts the cluster, waits until it is up,
    yields a view on it, and in its ``finally`` clause closes the client,
    stops the cluster and deletes the profile if it was a throwaway one.

      - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
      - queue: Passed through to the cluster start helper.
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - profile: Profile to use. When None, a throwaway profile is created
        and deleted again on exit.
      - start_wait: How long to wait for the cluster to startup, in minutes.
        Defaults to 16 minutes. Set to longer for slow starting clusters.
      - extra_params: Optional dict of extra settings, passed through to the
        cluster start helper.
      - retries: Number of retries to allow for failed tasks.

    Raises IOError when the cluster is not up within ``start_wait`` minutes,
    and re-raises subprocess.CalledProcessError once the number of failed
    start attempts exceeds the retry limit.
    """
    if extra_params is None:
        extra_params = {}
    delay = 10
    max_delay = start_wait * 60
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    # Retry starting the cluster, sleeping between failed attempts.
    while True:
        try:
            _start(scheduler, profile, queue, num_jobs, cores_per_job, cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(url_file, num_jobs):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file)
        yield _get_balanced_blocked_view(client, retries)
    finally:
        # Compare against None explicitly: a plain truth test on a sized
        # object is False when its length is zero, which would skip close().
        if client is not None:
            client.close()
        _stop(profile, cluster_id)
        if has_throwaway:
            delete_profile(profile)
예제 #10
0
def _nengines_up(url_file):
    """Return the number of engines up.

    Connects a Client using ``url_file`` (60 second timeout) and counts its
    engine ids. Returns 0 when iperror.TimeoutError or IOError is raised.
    """
    try:
        client = Client(url_file, timeout=60)
        try:
            up = len(client.ids)
        finally:
            # Always release the connection, even if the query fails.
            client.close()
    # the controller isn't up yet
    except iperror.TimeoutError:
        return 0
    # the JSON file is not available to parse
    except IOError:
        return 0
    return up
예제 #11
0
def _nengines_up(url_file):
    """Return the number of engines up.

    Connects a Client using ``url_file`` (60 second timeout) and counts its
    engine ids. Returns 0 when iperror.TimeoutError or IOError is raised.
    """
    try:
        client = Client(url_file, timeout=60)
        try:
            up = len(client.ids)
        finally:
            # Always release the connection, even if the query fails.
            client.close()
    # the controller isn't up yet
    except iperror.TimeoutError:
        return 0
    # the JSON file is not available to parse
    except IOError:
        return 0
    return up
 def test_hubresult_timestamps(self):
     """Compare timing data of a result re-fetched through a second Client.

     Runs a short sleep on all engines, fetches the same result by message
     id from a separate Client, and checks its elapsed, wall_time and
     serial_time values against the original async result.
     """
     self.minimum_engines(4)
     view = self.client[:]
     async_result = view.apply_async(time.sleep, 0.25)
     async_result.get(2)
     second_client = Client(profile='iptest')
     # The second Client has to be closed in a finally clause; otherwise
     # its sockets are left dangling and cause problems.
     try:
         time.sleep(0.25)
         hub_result = second_client.get_result(async_result.msg_ids)
         self.assertTrue(
             hub_result.elapsed > 0.,
             "got bad elapsed: %s" % hub_result.elapsed)
         hub_result.get(1)
         self.assertTrue(
             hub_result.wall_time < async_result.wall_time + 0.2,
             "got bad wall_time: %s > %s" % (hub_result.wall_time, async_result.wall_time))
         self.assertEqual(hub_result.serial_time, async_result.serial_time)
     finally:
         second_client.close()
예제 #13
0
def add_engines(n=1, profile='iptest'):
    """Start ``n`` ipengine processes and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engine processes to start.
    profile : str
        Profile name given to the Client and to each ``ipengine`` command.

    Returns
    -------
    list
        The Popen objects started by this call (each one is also appended
        to the shared ``processes`` list).

    Raises
    ------
    RuntimeError
        If any started process has exited while waiting, or if the engines
        have not all connected after 10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client on every exit path, including the RuntimeErrors
    # raised below, so its sockets are not left open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = Popen(['ipengine'] + ['--profile', profile, '--log-level', '10', '--log-to-file'],
                       stdout=blackhole, stderr=STDOUT)
            processes.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            # Let the client process pending messages so len(rc) can change.
            rc.spin()
    finally:
        rc.close()
    return eps
예제 #14
0
def cluster_view(parallel, config):
    """Provide a view on an ipython cluster for processing.

    This is a generator: it starts the cluster, waits until it is up,
    pushes ``config`` to every engine and sets up logging there, yields a
    load-balanced view, and in its ``finally`` clause closes the client and
    stops the cluster.

    parallel is a dictionary with:
      - scheduler: The type of cluster to start (lsf, sge).
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
      - profile: The profile to start the cluster with.

    Raises IOError when the cluster is not up after the maximum wait, and
    re-raises subprocess.CalledProcessError once the number of failed start
    attempts exceeds the retry limit.
    """
    delay = 5
    max_delay = 300
    max_tries = 10
    profile = parallel["profile"]
    cluster_id = str(uuid.uuid1())
    num_tries = 0
    # Retry starting the cluster, sleeping between failed attempts.
    while True:
        try:
            _start(parallel, profile, cluster_id)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(profile, cluster_id, parallel["num_jobs"]):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        # NOTE: the client connects by profile only; cluster_id is not
        # passed to it.
        client = Client(profile=profile)
        # push config to all engines and force them to set up logging
        client[:]['config'] = config
        client[:].execute('from bcbio.log import setup_logging')
        client[:].execute('setup_logging(config)')
        client[:].execute('from bcbio.log import logger')
        yield client.load_balanced_view()
    finally:
        # Compare against None explicitly: a plain truth test on a sized
        # object is False when its length is zero, which would skip close().
        if client is not None:
            client.close()
        _stop(profile, cluster_id)
예제 #15
0
def add_engines(n=1, profile='iptest'):
    """Start ``n`` engines for a profile and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engines to start.
    profile : str
        Profile name given to the Client and to each engine command line.

    Returns
    -------
    list
        The launchers started by this call (each one is also appended to
        the shared ``launchers`` list).

    Raises
    ------
    RuntimeError
        If any started launcher reports a non-None ``poll()`` result while
        waiting, or if the engines have not all connected after 10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client on every exit path, including the RuntimeErrors
    # raised below, so its sockets are not left open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + ['--profile=%s' % profile, '--log-level=50']
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            # Let the client process pending messages so len(rc) can change.
            rc.spin()
    finally:
        rc.close()
    return eps
예제 #16
0
def add_engines(n=1, profile='iptest'):
    """Start ``n`` engines for a profile and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engines to start.
    profile : str
        Profile name given to the Client and to each engine command line.

    Returns
    -------
    list
        The launchers started by this call (each one is also appended to
        the shared ``launchers`` list).

    Raises
    ------
    RuntimeError
        If any started launcher reports a non-None ``poll()`` result while
        waiting, or if the engines have not all connected after 10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client on every exit path, including the RuntimeErrors
    # raised below, so its sockets are not left open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + ['profile=%s' % profile, 'log_level=50']
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            # Let the client process pending messages so len(rc) can change.
            rc.spin()
    finally:
        rc.close()
    return eps
예제 #17
0
def add_engines(n=1, profile='iptest'):
    """Start ``n`` ipengine processes and wait until they have connected.

    Parameters
    ----------
    n : int
        Number of engine processes to start.
    profile : str
        Profile name given to the Client and to each ``ipengine`` command.

    Returns
    -------
    list
        The Popen objects started by this call (each one is also appended
        to the shared ``processes`` list).

    Raises
    ------
    RuntimeError
        If any started process has exited while waiting, or if the engines
        have not all connected after 10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client on every exit path, including the RuntimeErrors
    # raised below, so its sockets are not left open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = Popen(
                ['ipengine'] +
                ['--profile', profile, '--log-level', '10', '--log-to-file'],
                stdout=blackhole,
                stderr=STDOUT)
            processes.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            # Let the client process pending messages so len(rc) can change.
            rc.spin()
    finally:
        rc.close()
    return eps
예제 #18
0
class parakat(object):
    """
    Uses the ipython clustering for running kat objects in parallel.

    To use this you must have installed ipyparallel, for example, with:

        pip install ipyparallel

    Then you must start an ipython cluster on your computer.
    From a new terminal use the command:

        ipcluster start -n 4

    or:

        ipcluster start --n=4

    This will start a cluster with 4 workers.

    To run a kat object use:

        pk = parakat()
        pk.run(kat1)
        pk.run(kat2)
        pk.run(kat3)

        outs = pk.getResults()

    The list 'outs' will contain the katRun objects you'd normally get if
    you had just called kat1.run(), etc. The results list is matched to the
    order in which you ran the kats.

    If you need to stop long running kat processes the chances are you will
    also need to kill the ipython cluster process, as sometimes they carry
    on running.
    """
    def __init__(self, **kwargs):
        """Connect to the cluster; ``kwargs`` are forwarded to ``Client``."""
        self._rc = Client(**kwargs)
        self._lview = self._rc.load_balanced_view()
        self._lview.block = False
        self._results = []
        self._run_count = 0

    def run(self, kat, func=None, *args, **kwargs):
        """Submit one kat object to the cluster without blocking.

        ``func`` is the callable executed remotely (defaults to ``_run``).
        It is called with the generated kat script as a single string, the
        current working directory, the kat's IFO object (or None), and then
        any extra ``args``/``kwargs``. The async result is stored for
        ``getResults``.
        """
        if func is None:
            func = _run

        kat_IFO = None

        if hasattr(kat, 'IFO'):
            if hasattr(kat.IFO, "_IFO__kat"):
                kat.IFO._IFO__kat = None  # can't pickle stored kat
                kat_IFO = kat.IFO

        try:
            self._results.append(
                self._lview.apply_async(func, "".join(kat.generateKatScript()),
                                        os.getcwd(), kat_IFO, *args, **kwargs))
        finally:
            # Put the back-reference in place again even when submission
            # fails, so the caller's kat object is not left modified.
            if kat_IFO is not None:
                kat.IFO._IFO__kat = kat

        self._run_count += 1

    def getResults(self):
        """Wait for all submitted jobs and return their results in order.

        A progress bar is updated while waiting. The returned list is in
        the order the jobs were submitted with ``run``.
        """
        p = ProgressBar(maxval=self._run_count,
                        widgets=["Parallel jobs: ",
                                 Percentage(),
                                 Bar()])

        while not self._lview.wait(self._results, timeout=0.1):
            p.update(self._run_count -
                     self._lview.queue_status()['unassigned'])

        return [done.get() for done in self._results]

    def clear(self):
        """Forget all stored async results."""
        self._results = []

    def close(self):
        """Close the underlying cluster client."""
        self._rc.close()
예제 #19
0
class EngineManager(object):
    def __init__(self):
        self.profile = None
        self.started_controller = None
        self.started_engines = set()
        self._client = None

    def _select_profile(self):
        # See IPython.core.profileapp:list_profile_in()
        profiles = []
        for filename in os.listdir(get_ipython_dir()):
            if filename.startswith('profile_'):
                profiles.append(filename[8:])

        if profiles == ['default'] and not qt_available:
            self.profile = 'default'
        elif not qt_available:
            raise ValueError("'default' IPython profile does not exist "
                             "and PyQt4 is not available")
        else:
            self.profile = choose_profile(profiles)

    def ensure_controller(self, connect_only=False):
        """Make sure a controller is available, else start a local one.
        """
        if self._client:
            return self._client

        if self.profile is None:
            self._select_profile()
        if self.profile is None:
            return None
        print "parallelflow: using IPython profile %r" % self.profile

        try:
            self._client = Client(profile=self.profile)
            print "parallelflow: connected to controller"
            return self._client
        except error.TimeoutError:
            print "parallelflow: timeout when connecting to controller"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Start controller",
                        "Unable to connect to the configured IPython "
                        "controller. Do you want to start one?",
                        QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True
        except IOError:
            print "parallelflow: didn't find a controller to connect to"
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Start controller",
                        "No controller is configured in this IPython profile. "
                        "Do you want to start one?",
                        QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True

        if start_ctrl:
            ctrl_pid = os.path.join(
                    locate_profile(self.profile),
                    'pid',
                    'ipcontroller.pid')
            if os.path.exists(ctrl_pid):
                os.remove(ctrl_pid)
            print "parallelflow: starting controller"
            proc, code = self.start_process(
                    lambda: os.path.exists(ctrl_pid),
                    sys.executable,
                    '-m',
                    'IPython.parallel.apps.ipcontrollerapp',
                    '--profile=%s' % self.profile)
            if code is not None:
                if qt_available:
                    QtGui.QMessageBox.critical(
                            None,
                            "Error",
                            "Controller exited with code %d" % code)
                print ("parallelflow: controller process exited with "
                       "code %d" % code)
                return None
            else:
                self.started_controller = proc
                print "parallelflow: controller started, connecting"
                self._client = Client(profile=self.profile)
                return self._client

        return None

    @staticmethod
    def start_process(condition, *args):
        """Executes a file and waits for a condition.
        """
        prev_dir = os.getcwd()
        os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir))
        try:
            p = subprocess.Popen(args)
        finally:
            os.chdir(prev_dir)
        if condition is None:
            return p, None
        else:
            while True:
                time.sleep(0.5)
                if condition():
                    return p, None
                res = p.poll()
                if res is not None:
                    return None, res

    def start_engines(self, nb=None, prompt="Number of engines to start"):
        """Start some engines locally
        """
        c = self.ensure_controller()
        if c is None:
            if qt_available:
                QtGui.QMessageBox.warning(
                        None,
                        "No controller",
                        "Can't start engines: couldn't connect to a "
                        "controller")
            print "parallelflow: no controller, not starting engines"
        else:
            if not nb and qt_available:
                nb, res = QtGui.QInputDialog.getInt(
                        None,
                        "Start engines",
                        prompt,
                        1,  # value
                        1,  # min
                        16) # max
                if not res:
                    return
            elif nb is None:
                nb = 1
            print "parallelflow: about to start %d engines" % nb
            if qt_available:
                bar = QtGui.QProgressDialog(
                        "Starting engines...",
                        None,
                        0, nb)
                def progress(n):
                    bar.setValue(n)
                bar.show()
            else:
                def progress(n): pass
            progress(0)

            init_engines = set(c.ids)
            # Start the processes
            starting = set()
            for i in xrange(nb):
                proc, res = self.start_process(
                        None,
                        sys.executable,
                        '-m',
                        'IPython.parallel.apps.ipengineapp',
                        '--profile=%s' % self.profile)
                starting.add(proc)
            # Wait for each one to either fail or connect
            failed = []
            connected = 0
            while connected < len(starting):
                connected = len(set(c.ids) - init_engines)
                progress(len(failed) + connected)
                time.sleep(0.5)
                for p in list(starting):
                    res = p.poll()
                    if res is not None:
                        failed.append(res)
                        starting.remove(p)
            if failed:
                nb_failed = len(failed)
                if nb_failed > 3:
                    failed = "%s, ..." % (', '.join('%d' % f for f in failed))
                else:
                    failed = ', '.join('%d' % f for f in failed)
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None,
                        "Error",
                        "%d engine(s) exited with codes: %s" % (
                        nb_failed, failed))
                print "parallelflow: %d engine(s) exited with codes: %s" % (
                        nb_failed, failed)
            self.started_engines.update(starting)

            if qt_available:
                bar.hide()
                bar.deleteLater()
            print "parallelflow: %d engines started" % (i + 1)

    def info(self):
        """Show some information on the cluster.
        """
        client = self.ensure_controller(connect_only=True)

        print "----- IPython information -----"
        print "profile: %s" % self.profile
        connected = client is not None
        print "connected to controller: %s" % (
                "yes" if connected else "no")
        st_ctrl = (self.started_controller is not None and
                        self.started_controller.poll() is None)
        print "controller started from VisTrails: %s" % (
                "running" if st_ctrl else "no")
        st_engines = sum(1 for p in self.started_engines if p.poll() is None)
        print "engines started from VisTrails: %d" % st_engines
        if client is not None:
            nb_engines = len(client.ids)
        else:
            nb_engines = None
        print "total engines in cluster: %s" % (
                nb_engines if nb_engines is not None else "(unknown)")
        if connected and client.ids:
            dview = client[:]
            with dview.sync_imports():
                import os
                import platform
                import socket
            engines = dview.apply_async(
                    eval,
                    '(os.getpid(), platform.system(), socket.getfqdn())'
            ).get_dict()
            engines = sorted(
                    engines.items(),
                    key=lambda (ip_id, (pid, system, fqdn)): (fqdn, ip_id))
            print "engines:"
            print "\tid\tsystem\tPID\tnode FQDN"
            print "\t--\t------\t---\t---------"
            for ip_id, (pid, system, fqdn) in engines:
                print "\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn)
        print ""

        if qt_available:
            dialog = QtGui.QDialog()
            layout = QtGui.QVBoxLayout()
            form = QtGui.QFormLayout()
            form.addRow(
                    "Profile:",
                    QtGui.QLabel(self.profile))
            form.addRow(
                    "Connected:",
                    QtGui.QLabel("yes" if connected else "no"))
            form.addRow(
                    "Controller started from VisTrails:",
                    QtGui.QLabel("running" if st_ctrl else "no"))
            form.addRow(
                    "Engines started from VisTrails:",
                    QtGui.QLabel(str(st_engines)))
            form.addRow(
                    "Total engines in cluster:",
                    QtGui.QLabel(str(nb_engines)
                                 if nb_engines is not None
                                 else "(unknown)"))
            layout.addLayout(form)
            if connected and client.ids:
                tree = QtGui.QTreeWidget()
                tree.setHeaderHidden(False)
                tree.setHeaderLabels(["IPython id", "PID", "System type"])
                engine_tree = dict()
                for ip_id, (pid, system, fqdn) in engines:
                    engine_tree.setdefault(fqdn, []).append(
                            (ip_id, pid, system))
                for fqdn, info in engine_tree.iteritems():
                    node = QtGui.QTreeWidgetItem([fqdn])
                    tree.addTopLevelItem(node)
                    tree.setFirstItemColumnSpanned(node, True)
                    for ip_id, pid, system in info:
                        node.addChild(QtGui.QTreeWidgetItem([
                                str(ip_id),
                                str(pid),
                                system]))
                for i in xrange(tree.columnCount()):
                    tree.resizeColumnToContents(i)
                tree.expandAll()
                layout.addWidget(tree)

            ok = QtGui.QPushButton("Ok")
            QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'),
                                   dialog, QtCore.SLOT('accept()'))
            layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter)
            dialog.setLayout(layout)
            dialog.exec_()

    def change_profile(self):
        self.cleanup()

        old_profile = self.profile
        self._select_profile()
        if not self.profile:
            self.profile = old_profile

        if self.profile != old_profile:
            # Here, the processes that were started but the user didn't want to
            # clean up are abandonned
            # They will continue running but later cleanups won't ask for these
            # ones
            self.started_engines = set()
            self.started_controller = None

    def cleanup(self):
        """Shut down the started processes (with user confirmation).
        """
        engines = sum(1 for p in self.started_engines if p.poll() is None)
        ctrl = (self.started_controller is not None and
                self.started_controller.poll() is None)
        print ("parallelflow: cleanup: %s, %d engines running" % (
               "controller running" if ctrl else "no controller",
               engines))

        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                        None,
                        "Shutdown controller",
                        "The controller is still running. Do you want to stop "
                        "it?",
                        QtGui.QMessageBox.Yes,
                        QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(
                            targets='all',
                            restart=False,
                            hub=True,
                            block=False)
                    hub_shutdown = True
                    print "parallelflow: requested hub shutdown"
                else:
                    if self.started_controller.poll() is not None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                    print "parallelflow: controller terminated"
            self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                        None,
                        "Shutdown engines",
                        "%d engines started here%s are still "
                        "running. Do you want to stop them?" % (
                                engines,
                                total),
                        QtGui.QMessageBox.Yes,
                        QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                for engine in self.started_engines:
                    if engine.poll() is not None:
                        engine.terminate()
                        engine.wait()
                print ("parallelflow: %d engines terminated" %
                       len(self.started_engines))
            self.started_engines = set()

        if self._client is not None:
            print "parallelflow: closing client"
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                        None,
                        "Couldn't connect",
                        "Couldn't connect to a controller. Is the cluster "
                        "down already?")
            print ("parallelflow: shutdown_cluster requested, but could "
                   "not connect to a controller")
            return

        if qt_available:
            res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown cluster",
                    "This will use the client connection to request the hub "
                    "and every engine to shutdown. Continue?",
                    QtGui.QMessageBox.Ok,
                    QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return

        self._client.shutdown(
                targets='all',
                restart=False,
                hub=True,
                block=False)
        print "parallelflow: cluster shutdown requested"
        self._client = None
예제 #20
0
class Cluster(object):
    """Thin wrapper around ``ipcluster`` for one IPython parallel profile.

    Recognised keyword arguments (all optional):

    - ``profile``: IPython profile name, default ``"default"``.
    - ``cores``: number of engines, passed as ``--n``; default 1.
    - ``delay``: passed as ``--delay``; default ``DEFAULT_DELAY``.
    - ``scheduler``: scheduler name, upper-cased when stored. ``"LSF"`` and
      ``"SGE"`` add the scheduler launcher options; any other value runs
      the plain ``ipcluster start`` command.
    - ``queue``: scheduler queue name, default ``"hsph"``.
    - ``work``: stored as ``_work``; no method of this class reads it.
    - ``log_level``: passed as ``--log-level``; default 30.

    The client and both views are created lazily and cached. The caches are
    compared against None instead of being tested for truthiness, because
    ``len()`` of a client is its number of engines: a connected client that
    has no engines yet is falsy.
    """

    def __init__(self, **kwargs):
        self.profile = kwargs.get("profile", "default")
        self.n = kwargs.get("cores", 1)
        self.delay = kwargs.get("delay", DEFAULT_DELAY)
        self.scheduler = kwargs.get("scheduler", "").upper()
        self.queue = kwargs.get("queue", "hsph")
        self._client = None
        self._view = None
        self._direct_view = None
        self._work = kwargs.get("work", ".")
        self._log_level = kwargs.get("log_level", 30)
        # Generated for the planned --cluster-id support (see start());
        # not passed to ipcluster yet.
        self._cluster_id = str(uuid.uuid1())

    def _ipcluster_start_common(self):
        """Return the ``ipcluster start`` arguments shared by every mode."""
        return [
            "ipcluster", "start", "--daemonize=True",
            "--delay=" + str(self.delay),
            "--IPClusterEngines.early_shutdown=180",
            "--log-level=" + str(self._log_level),
            "--profile=%s" % (self.profile),
            "--n=%d" % (self.n), "--debug"
        ]

    def _is_scheduler_supported(self):
        """Return True when ``self.scheduler`` is "LSF" or "SGE"."""
        SUPPORTED_SCHEDULERS = ["LSF", "SGE"]
        return self.scheduler in SUPPORTED_SCHEDULERS

    def _start_with_scheduler(self):
        """Run ``ipcluster start`` with the scheduler-specific launchers.

        Raises:
            subprocess.CalledProcessError: if ``ipcluster`` exits non-zero.
        """
        ns = "bcbio.distributed.ipython"
        engine_class = "Bcbio%sEngineSetLauncher" % self.scheduler
        controller_class = "Bcbio%sControllerLauncher" % self.scheduler
        cmd = self._ipcluster_start_common()
        cmd.extend([
            "--IPClusterStart.controller_launcher_class=%s.%s" %
            (ns, controller_class),
            "--IPClusterStart.engine_launcher_class=%s.%s" %
            (ns, engine_class),
            "--%sLauncher.queue=%s" % (self.scheduler, self.queue)
        ])
        subprocess.check_call(cmd)

    def _start_with_local(self):
        """Run the common ``ipcluster start`` command without extra options.

        Raises:
            subprocess.CalledProcessError: if ``ipcluster`` exits non-zero.
        """
        subprocess.check_call(self._ipcluster_start_common())

    def start(self):
        """Start the cluster, through the scheduler when it is supported.

        XXX: in the future, add "--cluster-id=" + self._cluster_id to
        this, to run each new cluster with a different ID, so we
        can reuse the same profile. right now there is a bug in
        ipython that doesn't support this
        """
        if self._is_scheduler_supported():
            self._start_with_scheduler()
        else:
            self._start_with_local()

    def client(self):
        """Return the cached client, connecting on first use."""
        # add cluster_id=self._cluster_id to this call when the bug
        # is fixed in iPython
        if self._client is None:
            self._client = Client(profile=self.profile)
        return self._client

    def new_client(self):
        """Close the current client, if any, and connect a fresh one.

        The cached views are dropped as well: they were built from the
        client that is being closed.
        """
        if self._client is not None:
            self._client.close()
        self._view = None
        self._direct_view = None
        self._client = Client(profile=self.profile)

    def view(self):
        """Return the cached blocking, load balanced view of the engines."""
        if self._view is None:
            balanced = self.client().load_balanced_view()
            balanced.block = True
            self._view = balanced
        return self._view

    def direct_view(self):
        """Return the cached view over all engines (``client[:]``)."""
        if self._direct_view is None:
            # Go through client() so the connection is created on demand
            self._direct_view = self.client()[:]
        return self._direct_view

    def stop(self):
        """Run ``ipcluster stop`` for this profile; the exit status is ignored."""
        # add "--cluster-id=%s" % (self._cluster_id) to the command when
        # this gets fixed in iPython
        subprocess.call(["ipcluster", "stop", "--profile=%s" % (self.profile)])

    def is_up(self):
        """Return True once at least ``self.n`` engines are connected.

        An IOError while connecting is taken to mean the controller is not
        up yet; it is logged and False is returned.
        """
        try:
            up = len(self.client().ids)
        except IOError:
            logger.info("Waiting for the controller to come up.")
            return False
        not_up = self.n - up
        if not_up > 0:
            logger.info("Waiting for %d engines to come up." % (not_up))
            return False
        return True
예제 #21
0
class EngineManager(object):
    """Manage a connection to an IPython controller and local engines.

    Tracks the selected IPython profile, the controller and engine
    subprocesses started by this object, and a cached client. When the
    module-level ``qt_available`` flag is true the user is asked through
    PyQt dialogs; otherwise default answers are used.
    """

    def __init__(self):
        self.profile = None
        self.started_controller = None
        self.started_engines = set()
        self._client = None

    def _select_profile(self):
        """Set ``self.profile`` from the profiles in the IPython directory.

        With Qt the choice is delegated to ``choose_profile``. Without Qt
        the 'default' profile is used when it exists.

        Raises:
            ValueError: Qt is unavailable and there is no 'default' profile.
        """
        # See IPython.core.profileapp:list_profile_in()
        prefix = 'profile_'
        profiles = [name[len(prefix):]
                    for name in os.listdir(get_ipython_dir())
                    if name.startswith(prefix)]

        if qt_available:
            self.profile = choose_profile(profiles)
        elif 'default' in profiles:
            self.profile = 'default'
        else:
            raise ValueError("'default' IPython profile does not exist "
                             "and PyQt4 is not available")

    @staticmethod
    def _confirm_start_controller(connect_only, question):
        """Decide whether a local controller should be started.

        Returns False when ``connect_only`` is set, the user's answer to
        ``question`` when Qt is available, and True otherwise.
        """
        if connect_only:
            return False
        if qt_available:
            res = QtGui.QMessageBox.question(
                None, "Start controller", question,
                QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
            return res == QtGui.QMessageBox.Yes
        return True

    def ensure_controller(self, connect_only=False):
        """Make sure a controller is available, else start a local one.

        Args:
            connect_only: when true, only try to connect; never start a
                controller.

        Returns:
            The cached or newly created client, or None when no profile
            was selected, when connecting failed and no controller was
            started, or when the started controller exited before its pid
            file appeared.
        """
        # Compare with None: len() of a client is its number of engines, so
        # a connected client without engines is falsy and would be replaced.
        if self._client is not None:
            return self._client

        if self.profile is None:
            self._select_profile()
        if self.profile is None:
            return None
        print("parallelflow: using IPython profile %r" % self.profile)

        try:
            self._client = Client(profile=self.profile)
        except error.TimeoutError:
            print("parallelflow: timeout when connecting to controller")
            start_ctrl = self._confirm_start_controller(
                connect_only,
                "Unable to connect to the configured IPython "
                "controller. Do you want to start one?")
        except IOError:
            print("parallelflow: didn't find a controller to connect to")
            start_ctrl = self._confirm_start_controller(
                connect_only,
                "No controller is configured in this IPython profile. "
                "Do you want to start one?")
        else:
            print("parallelflow: connected to controller")
            return self._client

        if not start_ctrl:
            return None

        ctrl_pid = os.path.join(locate_profile(self.profile), 'pid',
                                'ipcontroller.pid')
        # Remove a stale pid file: its reappearance is the start-up signal
        if os.path.exists(ctrl_pid):
            os.remove(ctrl_pid)
        print("parallelflow: starting controller")
        proc, code = self.start_process(
            lambda: os.path.exists(ctrl_pid), sys.executable, '-m',
            'IPython.parallel.apps.ipcontrollerapp',
            '--profile=%s' % self.profile)
        if code is not None:
            if qt_available:
                QtGui.QMessageBox.critical(
                    None, "Error", "Controller exited with code %d" % code)
            print(
                "parallelflow: controller process exited with "
                "code %d" % code)
            return None

        self.started_controller = proc
        print("parallelflow: controller started, connecting")
        self._client = Client(profile=self.profile)
        return self._client

    @staticmethod
    def start_process(condition, *args):
        """Executes a file and waits for a condition.

        Args:
            condition: None to return right after spawning, or a callable
                polled every 0.5 s until it returns a true value.
            *args: the command line, passed to ``subprocess.Popen``.

        Returns:
            ``(process, None)`` once the condition holds (or immediately
            when there is none), or ``(None, exit_code)`` if the process
            exits first.
        """
        # The child is spawned from the parent of the VisTrails root
        # directory; the previous working directory is always restored.
        prev_dir = os.getcwd()
        os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir))
        try:
            p = subprocess.Popen(args)
        finally:
            os.chdir(prev_dir)
        if condition is None:
            return p, None
        while True:
            time.sleep(0.5)
            if condition():
                return p, None
            res = p.poll()
            if res is not None:
                return None, res

    def start_engines(self, nb=None, prompt="Number of engines to start"):
        """Start some engines locally

        Args:
            nb: number of engines. When falsy and Qt is available the user
                is asked (1 to 16); when None without Qt, one is started.
            prompt: label of the input dialog.

        Engine processes that keep running are added to
        ``self.started_engines``; exit codes of those that died are
        reported.
        """
        c = self.ensure_controller()
        if c is None:
            if qt_available:
                QtGui.QMessageBox.warning(
                    None, "No controller",
                    "Can't start engines: couldn't connect to a "
                    "controller")
            print("parallelflow: no controller, not starting engines")
            return

        if not nb and qt_available:
            nb, res = QtGui.QInputDialog.getInt(
                None,
                "Start engines",
                prompt,
                1,  # value
                1,  # min
                16)  # max
            if not res:
                return
        elif nb is None:
            nb = 1
        print("parallelflow: about to start %d engines" % nb)

        bar = None
        if qt_available:
            bar = QtGui.QProgressDialog("Starting engines...", None, 0, nb)
            bar.show()

        def progress(n):
            if bar is not None:
                bar.setValue(n)

        progress(0)

        init_engines = set(c.ids)
        # Start the processes
        starting = set()
        for _ in range(nb):
            proc, res = self.start_process(
                None, sys.executable, '-m',
                'IPython.parallel.apps.ipengineapp',
                '--profile=%s' % self.profile)
            starting.add(proc)
        # Wait for each one to either fail or connect
        failed = []
        connected = 0
        while connected < len(starting):
            connected = len(set(c.ids) - init_engines)
            progress(len(failed) + connected)
            time.sleep(0.5)
            for p in list(starting):
                res = p.poll()
                if res is not None:
                    failed.append(res)
                    starting.remove(p)
        if failed:
            nb_failed = len(failed)
            # Show at most three exit codes, with an ellipsis for the rest
            codes = ', '.join('%d' % f for f in failed[:3])
            if nb_failed > 3:
                codes = "%s, ..." % codes
            if qt_available:
                QtGui.QMessageBox.critical(
                    None, "Error", "%d engine(s) exited with codes: %s" %
                    (nb_failed, codes))
            print("parallelflow: %d engine(s) exited with codes: %s" % (
                nb_failed, codes))
        self.started_engines.update(starting)

        if bar is not None:
            bar.hide()
            bar.deleteLater()
        # Report the processes still alive, not the number requested
        print("parallelflow: %d engines started" % len(starting))

    def info(self):
        """Show some information on the cluster.

        Prints the profile, connection state, locally started processes
        and, when engines are connected, one line per engine (id, system,
        PID, FQDN). With Qt available the same data is shown in a dialog.
        """
        client = self.ensure_controller(connect_only=True)

        print("----- IPython information -----")
        print("profile: %s" % self.profile)
        connected = client is not None
        print("connected to controller: %s" % ("yes" if connected else "no"))
        st_ctrl = (self.started_controller is not None
                   and self.started_controller.poll() is None)
        print("controller started from VisTrails: %s" % (
            "running" if st_ctrl else "no"))
        st_engines = sum(1 for p in self.started_engines if p.poll() is None)
        print("engines started from VisTrails: %d" % st_engines)
        if client is not None:
            nb_engines = len(client.ids)
        else:
            nb_engines = None
        print("total engines in cluster: %s" % (
            nb_engines if nb_engines is not None else "(unknown)"))
        if connected and client.ids:
            dview = client[:]
            with dview.sync_imports():
                import os
                import platform
                import socket
            # The evaluated expression is a fixed literal, not user input
            engines = dview.apply_async(
                eval,
                '(os.getpid(), platform.system(), socket.getfqdn())').get_dict(
                )
            # Items are (ip_id, (pid, system, fqdn)); order by (fqdn, ip_id)
            engines = sorted(engines.items(),
                             key=lambda item: (item[1][2], item[0]))
            print("engines:")
            print("\tid\tsystem\tPID\tnode FQDN")
            print("\t--\t------\t---\t---------")
            for ip_id, (pid, system, fqdn) in engines:
                print("\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn))
        print("")

        if qt_available:
            dialog = QtGui.QDialog()
            layout = QtGui.QVBoxLayout()
            form = QtGui.QFormLayout()
            form.addRow("Profile:", QtGui.QLabel(self.profile))
            form.addRow("Connected:",
                        QtGui.QLabel("yes" if connected else "no"))
            form.addRow("Controller started from VisTrails:",
                        QtGui.QLabel("running" if st_ctrl else "no"))
            form.addRow("Engines started from VisTrails:",
                        QtGui.QLabel(str(st_engines)))
            form.addRow(
                "Total engines in cluster:",
                QtGui.QLabel(
                    str(nb_engines) if nb_engines is not None else "(unknown)")
            )
            layout.addLayout(form)
            if connected and client.ids:
                tree = QtGui.QTreeWidget()
                tree.setHeaderHidden(False)
                tree.setHeaderLabels(["IPython id", "PID", "System type"])
                # Group the engines by the node they run on
                engine_tree = dict()
                for ip_id, (pid, system, fqdn) in engines:
                    engine_tree.setdefault(fqdn, []).append(
                        (ip_id, pid, system))
                for fqdn, info in engine_tree.items():
                    node = QtGui.QTreeWidgetItem([fqdn])
                    tree.addTopLevelItem(node)
                    tree.setFirstItemColumnSpanned(node, True)
                    for ip_id, pid, system in info:
                        node.addChild(
                            QtGui.QTreeWidgetItem(
                                [str(ip_id), str(pid), system]))
                for i in range(tree.columnCount()):
                    tree.resizeColumnToContents(i)
                tree.expandAll()
                layout.addWidget(tree)

            ok = QtGui.QPushButton("Ok")
            QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'), dialog,
                                   QtCore.SLOT('accept()'))
            layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter)
            dialog.setLayout(layout)
            dialog.exec_()

    def change_profile(self):
        """Clean up, then select another profile.

        The previous profile is kept when the selection yields nothing.
        """
        self.cleanup()

        old_profile = self.profile
        self._select_profile()
        if not self.profile:
            self.profile = old_profile

        if self.profile != old_profile:
            # Here, the processes that were started but the user didn't want
            # to clean up are abandoned
            # They will continue running but later cleanups won't ask for
            # these ones
            self.started_engines = set()
            self.started_controller = None

    def cleanup(self):
        """Shut down the started processes (with user confirmation).

        A running controller is stopped through the client when one is
        connected (which also asks every engine to stop), otherwise its
        process is terminated. Engines started here that are still running
        are terminated unless the hub shutdown was requested. The client
        is closed at the end.
        """
        engines = sum(1 for p in self.started_engines if p.poll() is None)
        ctrl = (self.started_controller is not None
                and self.started_controller.poll() is None)
        print("parallelflow: cleanup: %s, %d engines running" %
              ("controller running" if ctrl else "no controller", engines))

        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                    None, "Shutdown controller",
                    "The controller is still running. Do you want to stop "
                    "it?", QtGui.QMessageBox.Yes, QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(targets='all',
                                          restart=False,
                                          hub=True,
                                          block=False)
                    hub_shutdown = True
                    print("parallelflow: requested hub shutdown")
                else:
                    # poll() returns None while the process is still alive
                    if self.started_controller.poll() is None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                    print("parallelflow: controller terminated")
            self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                    None, "Shutdown engines",
                    "%d engines started here%s are still "
                    "running. Do you want to stop them?" % (engines, total),
                    QtGui.QMessageBox.Yes, QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                for engine in self.started_engines:
                    # Only processes that have not exited need terminating
                    if engine.poll() is None:
                        engine.terminate()
                        engine.wait()
                print("parallelflow: %d engines terminated" %
                      len(self.started_engines))
            self.started_engines = set()

        if self._client is not None:
            print("parallelflow: closing client")
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.

        Only connects to an existing controller; when that fails the
        failure is reported and nothing is shut down. With Qt available
        the user must confirm with Ok.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                    None, "Couldn't connect",
                    "Couldn't connect to a controller. Is the cluster "
                    "down already?")
            print(
                "parallelflow: shutdown_cluster requested, but could "
                "not connect to a controller")
            return

        if qt_available:
            res = QtGui.QMessageBox.question(
                None, "Shutdown cluster",
                "This will use the client connection to request the hub "
                "and every engine to shutdown. Continue?",
                QtGui.QMessageBox.Ok, QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return

        self._client.shutdown(targets='all',
                              restart=False,
                              hub=True,
                              block=False)
        print("parallelflow: cluster shutdown requested")
        self._client = None
예제 #22
0
파일: __init__.py 프로젝트: roryk/bipy
class Cluster(object):
    """Start, query and stop an IPython parallel cluster with ``ipcluster``.

    Recognised keyword arguments (all optional):

    - ``profile``: IPython profile name, default ``"default"``.
    - ``cores``: number of engines, passed as ``--n``; default 1.
    - ``delay``: passed as ``--delay``; default ``DEFAULT_DELAY``.
    - ``scheduler``: scheduler name, upper-cased when stored. ``"LSF"`` and
      ``"SGE"`` add the scheduler launcher options; any other value runs
      the plain ``ipcluster start`` command.
    - ``queue``: scheduler queue name, default ``"hsph"``.
    - ``work``: stored as ``_work``; no method of this class reads it.
    - ``log_level``: passed as ``--log-level``; default 30.

    The client and both views are created lazily and cached. The caches are
    compared against None instead of being tested for truthiness, because
    ``len()`` of a client is its number of engines: a connected client that
    has no engines yet is falsy.
    """

    def __init__(self, **kwargs):
        self.profile = kwargs.get("profile", "default")
        self.n = kwargs.get("cores", 1)
        self.delay = kwargs.get("delay", DEFAULT_DELAY)
        self.scheduler = kwargs.get("scheduler", "").upper()
        self.queue = kwargs.get("queue", "hsph")
        self._client = None
        self._view = None
        self._direct_view = None
        self._work = kwargs.get("work", ".")
        self._log_level = kwargs.get("log_level", 30)
        # Generated for the planned --cluster-id support (see start());
        # not passed to ipcluster yet.
        self._cluster_id = str(uuid.uuid1())

    def _ipcluster_start_common(self):
        """Return the ``ipcluster start`` arguments shared by every mode."""
        return [
            "ipcluster",
            "start",
            "--daemonize=True",
            "--delay=" + str(self.delay),
            "--IPClusterEngines.early_shutdown=180",
            "--log-level=" + str(self._log_level),
            "--profile=%s" % (self.profile),
            "--n=%d" % (self.n),
            "--debug",
        ]

    def _is_scheduler_supported(self):
        """Return True when ``self.scheduler`` is "LSF" or "SGE"."""
        SUPPORTED_SCHEDULERS = ["LSF", "SGE"]
        return self.scheduler in SUPPORTED_SCHEDULERS

    def _start_with_scheduler(self):
        """Run ``ipcluster start`` with the scheduler-specific launchers.

        Raises:
            subprocess.CalledProcessError: if ``ipcluster`` exits non-zero.
        """
        ns = "bcbio.distributed.ipython"
        engine_class = "Bcbio%sEngineSetLauncher" % self.scheduler
        controller_class = "Bcbio%sControllerLauncher" % self.scheduler
        cmd = self._ipcluster_start_common()
        cmd.extend(
            [
                "--IPClusterStart.controller_launcher_class=%s.%s" % (ns, controller_class),
                "--IPClusterStart.engine_launcher_class=%s.%s" % (ns, engine_class),
                "--%sLauncher.queue=%s" % (self.scheduler, self.queue),
            ]
        )
        subprocess.check_call(cmd)

    def _start_with_local(self):
        """Run the common ``ipcluster start`` command without extra options.

        Raises:
            subprocess.CalledProcessError: if ``ipcluster`` exits non-zero.
        """
        subprocess.check_call(self._ipcluster_start_common())

    def start(self):
        """Start the cluster, through the scheduler when it is supported.

        XXX: in the future, add "--cluster-id=" + self._cluster_id to
        this, to run each new cluster with a different ID, so we
        can reuse the same profile. right now there is a bug in
        ipython that doesn't support this
        """
        if self._is_scheduler_supported():
            self._start_with_scheduler()
        else:
            self._start_with_local()

    def client(self):
        """Return the cached client, connecting on first use."""
        # add cluster_id=self._cluster_id to this call when the bug
        # is fixed in iPython
        if self._client is None:
            self._client = Client(profile=self.profile)
        return self._client

    def new_client(self):
        """Close the current client, if any, and connect a fresh one.

        The cached views are dropped as well: they were built from the
        client that is being closed.
        """
        if self._client is not None:
            self._client.close()
        self._view = None
        self._direct_view = None
        self._client = Client(profile=self.profile)

    def view(self):
        """Return the cached blocking, load balanced view of the engines."""
        if self._view is None:
            balanced = self.client().load_balanced_view()
            balanced.block = True
            self._view = balanced
        return self._view

    def direct_view(self):
        """Return the cached view over all engines (``client[:]``)."""
        if self._direct_view is None:
            # Go through client() so the connection is created on demand
            self._direct_view = self.client()[:]
        return self._direct_view

    def stop(self):
        """Run ``ipcluster stop`` for this profile; the exit status is ignored."""
        # add "--cluster-id=%s" % (self._cluster_id) to the command when
        # this gets fixed in iPython
        subprocess.call(["ipcluster", "stop", "--profile=%s" % (self.profile)])

    def is_up(self):
        """Return True once at least ``self.n`` engines are connected.

        An IOError while connecting is taken to mean the controller is not
        up yet; it is logged and False is returned.
        """
        try:
            up = len(self.client().ids)
        except IOError:
            logger.info("Waiting for the controller to come up.")
            return False
        not_up = self.n - up
        if not_up > 0:
            logger.info("Waiting for %d engines to come up." % (not_up))
            return False
        return True