def add_engines(n=1, profile='iptest', total=False):
    """Add a number of engines to a given profile and wait for them.

    If total is True, then already running engines are counted, and only
    the additional engines necessary (if any) are started.

    Returns the list of launchers started by this call.

    Raises RuntimeError if one of the started processes exits before all
    engines have connected, or if they have not all connected after
    15 seconds.
    """
    rc = Client(profile=profile)
    # The client is closed on every exit path, including the RuntimeErrors
    # raised while waiting, so a failed start does not leave it open.
    try:
        base = len(rc)
        if total:
            n = max(n - base, 0)

        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                '--profile=%s' % profile,
                '--log-level=50',
            ]
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 15:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
def add_engines(n=1, profile='iptest', total=False):
    """Add a number of engines to a given profile and wait for them.

    If total is True, then already running engines are counted, and only
    the additional engines necessary (if any) are started.

    Returns the list of launchers started by this call.

    Raises RuntimeError if one of the started processes exits before all
    engines have connected, or if they have not all connected after
    15 seconds.
    """
    rc = Client(profile=profile)
    # Close the client whether the wait succeeds or raises, so a failed
    # start does not leave it open.
    try:
        base = len(rc)
        if total:
            n = max(n - base, 0)

        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                '--profile=%s' % profile,
                '--log-level=50',
                '--InteractiveShell.colors=nocolor',
            ]
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 15:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None):
    """Provide a view on an ipython cluster for processing.

    - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
    - num_jobs: Number of jobs to start.
    - cores_per_job: The number of cores to use for each job.
    - start_wait: How long to wait for the cluster to startup, in minutes.
      Defaults to 16 minutes. Set to longer for slow starting clusters.
    - retries: Number of retries to allow for failed tasks.

    This is a generator that yields a single view; the cluster is stopped
    (and a throwaway profile deleted) when the generator is resumed or
    closed. NOTE(review): presumably wrapped with
    contextlib.contextmanager where it is defined -- the decorator is not
    visible here.

    Raises IOError if the cluster is not up within ``start_wait`` minutes,
    and re-raises subprocess.CalledProcessError once startup retries are
    exhausted.
    """
    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    # Local runs are polled more frequently than scheduler-backed ones.
    delay = 5 if extra_params.get("run_local") else 30
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(profile)
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    while 1:
        try:
            if extra_params.get("run_local"):
                _start_local(cores_per_job, profile, cluster_id)
            else:
                _start(scheduler, profile, queue, num_jobs, cores_per_job,
                       cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(url_file, num_jobs):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        yield _get_balanced_blocked_view(client, retries)
    finally:
        try:
            # Compare against None explicitly: a truthiness test could skip
            # close() if the client object evaluates as false (e.g. if its
            # length is the number of connected engines and that is zero).
            if client is not None:
                client.close()
        finally:
            # Always stop the cluster, even if closing the client failed.
            _stop(profile, cluster_id)
            if has_throwaway:
                delete_profile(profile)
def _is_up(profile, cluster_id, n):
    """Return True when at least ``n`` engines are connected for ``profile``.

    Returns False when connecting raises IOError.

    NOTE: ``cluster_id`` is accepted but not passed to Client; the
    connection is made with the profile only.
    """
    try:
        client = Client(profile=profile)
        up = len(client.ids)
        client.close()
    except IOError:
        return False
    else:
        # Report the actual state; falling off the end here would return
        # None, which a ``while not _is_up(...)`` caller reads as "not up".
        return up >= n
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None):
    """Provide a view on an ipython cluster for processing.

    - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
    - num_jobs: Number of jobs to start.
    - cores_per_job: The number of cores to use for each job.
    - start_wait: How long to wait for the cluster to startup, in minutes.
      Defaults to 16 minutes. Set to longer for slow starting clusters.
    - retries: Number of retries to allow for failed tasks.

    This is a generator that yields a single view; the cluster is stopped
    (and a throwaway profile deleted) when the generator is resumed or
    closed. NOTE(review): presumably wrapped with
    contextlib.contextmanager where it is defined -- the decorator is not
    visible here.

    Raises IOError if the cluster is not up within ``start_wait`` minutes,
    and re-raises subprocess.CalledProcessError once startup retries are
    exhausted.
    """
    if extra_params is None:
        extra_params = {}
    delay = 10
    max_delay = start_wait * 60
    # Increase default delay without changing max_delay for back compatibility
    delay = delay * 3
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create"]
        subprocess.check_call(cmd)
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    while 1:
        try:
            _start(scheduler, profile, queue, num_jobs, cores_per_job,
                   cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(url_file, num_jobs):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file, timeout=60)
        yield _get_balanced_blocked_view(client, retries)
    finally:
        try:
            # Compare against None explicitly: a truthiness test could skip
            # close() if the client object evaluates as false.
            if client is not None:
                client.close()
        finally:
            # Always stop the cluster, even if closing the client failed.
            _stop(profile, cluster_id)
            if has_throwaway:
                delete_profile(profile)
def _is_up(url_file, n):
    """Tell whether at least ``n`` engines are reachable via ``url_file``.

    An IOError while connecting or counting engines is reported as
    "not up" (False).
    """
    try:
        connection = Client(url_file)
        engine_count = len(connection.ids)
        connection.close()
    except IOError:
        return False
    return engine_count >= n
def _is_up(url_file, n):
    """Tell whether at least ``n`` engines are reachable via ``url_file``.

    The connection is attempted with a timeout of 60. A connection
    timeout or an IOError is reported as "not up" (False).
    """
    try:
        connection = Client(url_file, timeout=60)
        engine_count = len(connection.ids)
        connection.close()
    except (iperror.TimeoutError, IOError):
        return False
    return engine_count >= n
def cluster_view(scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None):
    """Provide a view on an ipython cluster for processing.

    - scheduler: The type of cluster to start (lsf, sge, pbs, torque).
    - num_jobs: Number of jobs to start.
    - cores_per_job: The number of cores to use for each job.
    - start_wait: How long to wait for the cluster to startup, in minutes.
      Defaults to 16 minutes. Set to longer for slow starting clusters.
    - retries: Number of retries to allow for failed tasks.

    This is a generator that yields a single view; the cluster is stopped
    (and a throwaway profile deleted) when the generator is resumed or
    closed. NOTE(review): presumably wrapped with
    contextlib.contextmanager where it is defined -- the decorator is not
    visible here.

    Raises IOError if the cluster is not up within ``start_wait`` minutes,
    and re-raises subprocess.CalledProcessError once startup retries are
    exhausted.
    """
    if extra_params is None:
        extra_params = {}
    delay = 10
    max_delay = start_wait * 60
    max_tries = 10
    if profile is None:
        has_throwaway = True
        profile = create_throwaway_profile()
    else:
        has_throwaway = False
    num_tries = 0

    cluster_id = str(uuid.uuid4())
    url_file = get_url_file(profile, cluster_id)

    while 1:
        try:
            _start(scheduler, profile, queue, num_jobs, cores_per_job,
                   cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(url_file, num_jobs):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        client = Client(url_file)
        yield _get_balanced_blocked_view(client, retries)
    finally:
        try:
            # Compare against None explicitly: a truthiness test could skip
            # close() if the client object evaluates as false.
            if client is not None:
                client.close()
        finally:
            # Always stop the cluster, even if closing the client failed.
            _stop(profile, cluster_id)
            if has_throwaway:
                delete_profile(profile)
def _nengines_up(url_file):
    """Return the number of engines currently up, or 0 if unreachable."""
    try:
        connection = Client(url_file, timeout=60)
        engine_count = len(connection.ids)
        connection.close()
    except iperror.TimeoutError:
        # the controller isn't up yet
        return 0
    except IOError:
        # the JSON file is not available to parse
        return 0
    return engine_count
def test_hubresult_timestamps(self):
    """Compare timing metadata of a result fetched through a second client
    with the timing recorded on the original async result."""
    self.minimum_engines(4)
    view = self.client[:]
    async_result = view.apply_async(time.sleep, 0.25)
    async_result.get(2)

    second_client = Client(profile='iptest')
    # The second Client has to be closed in a finally clause; leaving it
    # open would leave dangling sockets that cause problems.
    try:
        time.sleep(0.25)
        hub_result = second_client.get_result(async_result.msg_ids)
        self.assertTrue(
            hub_result.elapsed > 0.,
            "got bad elapsed: %s" % hub_result.elapsed)
        hub_result.get(1)
        self.assertTrue(
            hub_result.wall_time < async_result.wall_time + 0.2,
            "got bad wall_time: %s > %s" % (
                hub_result.wall_time, async_result.wall_time))
        self.assertEqual(hub_result.serial_time, async_result.serial_time)
    finally:
        second_client.close()
def add_engines(n=1, profile='iptest'):
    """Start ``n`` ipengine processes for ``profile`` and wait for them.

    Each process is appended to the module-level ``processes`` list.
    Returns the list of Popen objects started by this call.

    Raises RuntimeError if one of the started processes exits before all
    engines have connected, or if they have not all connected after
    10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client whether the wait succeeds or raises, so a failed
    # start does not leave it open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = Popen(
                ['ipengine', '--profile', profile,
                 '--log-level', '10', '--log-to-file'],
                stdout=blackhole, stderr=STDOUT)
            processes.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
def cluster_view(parallel, config):
    """Provide a view on an ipython cluster for processing.

    parallel is a dictionary with:
      - scheduler: The type of cluster to start (lsf, sge).
      - num_jobs: Number of jobs to start.
      - cores_per_job: The number of cores to use for each job.
    The ``profile`` key is also read from it here.

    ``config`` is pushed to every engine, which is then asked to set up
    logging from it.

    This is a generator that yields a single load-balanced view and stops
    the cluster afterwards. NOTE(review): presumably wrapped with
    contextlib.contextmanager where it is defined -- the decorator is not
    visible here.

    Raises IOError if the cluster is not up in time, and re-raises
    subprocess.CalledProcessError once startup retries are exhausted.
    """
    delay = 5
    max_delay = 300
    max_tries = 10
    profile = parallel["profile"]
    cluster_id = str(uuid.uuid1())
    num_tries = 0

    while 1:
        try:
            _start(parallel, profile, cluster_id)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)

    client = None
    try:
        slept = 0
        while not _is_up(profile, cluster_id, parallel["num_jobs"]):
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("Cluster startup timed out.")
        # NOTE: cluster_id is not passed to Client here.
        client = Client(profile=profile)
        # push config to all engines and force them to set up logging
        client[:]['config'] = config
        client[:].execute('from bcbio.log import setup_logging')
        client[:].execute('setup_logging(config)')
        client[:].execute('from bcbio.log import logger')
        yield client.load_balanced_view()
    finally:
        try:
            # Compare against None explicitly: a truthiness test could skip
            # close() if the client object evaluates as false.
            if client is not None:
                client.close()
        finally:
            # Always stop the cluster, even if closing the client failed.
            _stop(profile, cluster_id)
def add_engines(n=1, profile='iptest'):
    """Start ``n`` test engines for ``profile`` and wait for them.

    Each launcher is appended to the module-level ``launchers`` list.
    Returns the list of launchers started by this call.

    Raises RuntimeError if one of the started processes exits before all
    engines have connected, or if they have not all connected after
    10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client whether the wait succeeds or raises, so a failed
    # start does not leave it open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                '--profile=%s' % profile,
                '--log-level=50',
            ]
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
def add_engines(n=1, profile='iptest'):
    """Start ``n`` test engines for ``profile`` and wait for them.

    Each launcher is appended to the module-level ``launchers`` list.
    Returns the list of launchers started by this call.

    Raises RuntimeError if one of the started processes exits before all
    engines have connected, or if they have not all connected after
    10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client whether the wait succeeds or raises, so a failed
    # start does not leave it open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = TestProcessLauncher()
            ep.cmd_and_args = ipengine_cmd_argv + [
                'profile=%s' % profile,
                'log_level=50',
            ]
            ep.start()
            launchers.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
def add_engines(n=1, profile='iptest'):
    """Start ``n`` ipengine processes for ``profile`` and wait for them.

    Each process is appended to the module-level ``processes`` list.
    Returns the list of Popen objects started by this call.

    Raises RuntimeError if one of the started processes exits before all
    engines have connected, or if they have not all connected after
    10 seconds.
    """
    rc = Client(profile=profile)
    # Close the client whether the wait succeeds or raises, so a failed
    # start does not leave it open.
    try:
        base = len(rc)
        eps = []
        for _ in range(n):
            ep = Popen(
                ['ipengine', '--profile', profile,
                 '--log-level', '10', '--log-to-file'],
                stdout=blackhole, stderr=STDOUT)
            processes.append(ep)
            eps.append(ep)

        tic = time.time()
        while len(rc) < base + n:
            if any(ep.poll() is not None for ep in eps):
                raise RuntimeError("A test engine failed to start.")
            elif time.time() - tic > 10:
                raise RuntimeError("Timeout waiting for engines to connect.")
            time.sleep(.1)
            rc.spin()
    finally:
        rc.close()
    return eps
class parakat(object):
    """
    Uses the ipython clustering for running kat objects in parallel.

    To use this you must have installed ipyparallel, for example, with:

        pip install ipyparallel

    Then you must start an ipython cluster on your computer.
    From a new terminal use the command:

        ipcluster start -n 4

    or:

        ipcluster start --n=4

    This will start a cluster with 4 workers.

    To run a kat object use:

        pk = parakat()
        pk.run(kat1)
        pk.run(kat2)
        pk.run(kat3)

        outs = pk.getResults()

    The list 'outs' will contain the katRun object you'd normally get if
    you had just called kat1.run(), etc. The results list is matched to
    the order in which you run the kats.

    If you need to stop long running kat processes the chances are you
    will also need to kill the ipython cluster process, as sometimes they
    carry on running.
    """

    def __init__(self, **kwargs):
        """Connect to the cluster; ``kwargs`` are passed on to Client."""
        self._rc = Client(**kwargs)
        self._lview = self._rc.load_balanced_view()
        self._lview.block = False
        self._results = []
        self._run_count = 0

    def run(self, kat, func=None, *args, **kwargs):
        """Submit ``kat`` to the cluster without waiting for the result.

        ``func`` is the callable applied remotely (``_run`` when None).
        It is called with the generated kat script as one string, the
        current working directory, ``kat.IFO`` (or None), then ``*args``
        and ``**kwargs``.
        """
        if func is None:
            func = _run

        kat_IFO = None
        if hasattr(kat, 'IFO'):
            if hasattr(kat.IFO, "_IFO__kat"):
                kat.IFO._IFO__kat = None  # can't pickle stored kat
            kat_IFO = kat.IFO

        try:
            self._results.append(
                self._lview.apply_async(
                    func, "".join(kat.generateKatScript()), os.getcwd(),
                    kat_IFO, *args, **kwargs))
        finally:
            # Put the back-reference in place again even when submission
            # fails, so the caller's kat object is not left altered.
            if kat_IFO is not None:
                kat.IFO._IFO__kat = kat

        self._run_count += 1

    def getResults(self):
        """Wait for all submitted jobs; return their results in run order."""
        p = ProgressBar(maxval=self._run_count,
                        widgets=["Parallel jobs: ", Percentage(), Bar()])

        while not self._lview.wait(self._results, timeout=0.1):
            p.update(self._run_count
                     - self._lview.queue_status()['unassigned'])

        return [done.get() for done in self._results]

    def clear(self):
        """Forget all submitted jobs and reset the job counter."""
        self._results = []
        # The counter is the progress bar's maximum in getResults(), so it
        # has to be reset together with the result list.
        self._run_count = 0

    def close(self):
        """Close the connection to the cluster."""
        self._rc.close()
class EngineManager(object):
    """Tracks the IPython controller/engines started from here and the
    client connection used to talk to the cluster."""

    def __init__(self):
        self.profile = None
        # Process object of the controller started here, if any
        self.started_controller = None
        # Process objects of the engines started here
        self.started_engines = set()
        self._client = None

    def _select_profile(self):
        """Set ``self.profile``, asking the user when Qt is available.

        Raises ValueError when Qt is not available and the only existing
        profile is not 'default'.
        """
        # See IPython.core.profileapp:list_profile_in()
        profiles = []
        for filename in os.listdir(get_ipython_dir()):
            if filename.startswith('profile_'):
                profiles.append(filename[8:])

        if profiles == ['default'] and not qt_available:
            self.profile = 'default'
        elif not qt_available:
            raise ValueError("'default' IPython profile does not exist "
                             "and PyQt4 is not available")
        else:
            self.profile = choose_profile(profiles)

    def ensure_controller(self, connect_only=False):
        """Make sure a controller is available, else start a local one.

        Returns the connected client, or None when no profile is selected,
        no connection could be made, or the started controller exited.
        With ``connect_only`` set, no controller is ever started.
        """
        # Compare against None: a truthiness test would discard an existing
        # client that evaluates as false (e.g. one with no engines, if its
        # length is the engine count) and reconnect on every call.
        if self._client is not None:
            return self._client

        if self.profile is None:
            self._select_profile()
        if self.profile is None:
            return None
        print("parallelflow: using IPython profile %r" % self.profile)

        try:
            self._client = Client(profile=self.profile)
            print("parallelflow: connected to controller")
            return self._client
        except error.TimeoutError:
            print("parallelflow: timeout when connecting to controller")
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                    None,
                    "Start controller",
                    "Unable to connect to the configured IPython "
                    "controller. Do you want to start one?",
                    QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True
        except IOError:
            print("parallelflow: didn't find a controller to connect to")
            if connect_only:
                start_ctrl = False
            elif qt_available:
                res = QtGui.QMessageBox.question(
                    None,
                    "Start controller",
                    "No controller is configured in this IPython profile. "
                    "Do you want to start one?",
                    QtGui.QMessageBox.Yes | QtGui.QMessageBox.No)
                start_ctrl = res == QtGui.QMessageBox.Yes
            else:
                start_ctrl = True

        if start_ctrl:
            ctrl_pid = os.path.join(
                locate_profile(self.profile),
                'pid',
                'ipcontroller.pid')
            # Remove a stale pid file: its (re)appearance is the signal
            # that the new controller is up.
            if os.path.exists(ctrl_pid):
                os.remove(ctrl_pid)
            print("parallelflow: starting controller")
            proc, code = self.start_process(
                lambda: os.path.exists(ctrl_pid),
                sys.executable,
                '-m',
                'IPython.parallel.apps.ipcontrollerapp',
                '--profile=%s' % self.profile)
            if code is not None:
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None,
                        "Error",
                        "Controller exited with code %d" % code)
                print("parallelflow: controller process exited with "
                      "code %d" % code)
                return None
            else:
                self.started_controller = proc
                print("parallelflow: controller started, connecting")
                self._client = Client(profile=self.profile)
                return self._client

        return None

    @staticmethod
    def start_process(condition, *args):
        """Executes a file and waits for a condition.

        ``args`` is the command line given to subprocess.Popen. When
        ``condition`` is None this returns ``(process, None)`` right away.
        Otherwise it polls every 0.5s and returns ``(process, None)`` once
        ``condition()`` is true, or ``(None, exit_code)`` if the process
        exits first.
        """
        prev_dir = os.getcwd()
        os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir))
        try:
            p = subprocess.Popen(args)
        finally:
            os.chdir(prev_dir)
        if condition is None:
            return p, None
        else:
            while True:
                time.sleep(0.5)
                if condition():
                    return p, None
                res = p.poll()
                if res is not None:
                    return None, res

    def start_engines(self, nb=None, prompt="Number of engines to start"):
        """Start some engines locally.

        ``nb`` is the number of engines; when it is not given the user is
        asked (with Qt) or a single engine is started (without).
        """
        c = self.ensure_controller()
        if c is None:
            if qt_available:
                QtGui.QMessageBox.warning(
                    None,
                    "No controller",
                    "Can't start engines: couldn't connect to a "
                    "controller")
            print("parallelflow: no controller, not starting engines")
        else:
            if not nb and qt_available:
                nb, res = QtGui.QInputDialog.getInt(
                    None,
                    "Start engines",
                    prompt,
                    1,    # value
                    1,    # min
                    16)   # max
                if not res:
                    return
            elif nb is None:
                nb = 1
            print("parallelflow: about to start %d engines" % nb)
            if qt_available:
                bar = QtGui.QProgressDialog(
                    "Starting engines...",
                    None,
                    0, nb)

                def progress(n):
                    bar.setValue(n)
                bar.show()
            else:
                def progress(n):
                    pass
            progress(0)

            init_engines = set(c.ids)
            # Start the processes
            starting = set()
            for _ in xrange(nb):
                proc, res = self.start_process(
                    None,
                    sys.executable,
                    '-m',
                    'IPython.parallel.apps.ipengineapp',
                    '--profile=%s' % self.profile)
                starting.add(proc)
            # Wait for each one to either fail or connect
            failed = []
            connected = 0
            while connected < len(starting):
                connected = len(set(c.ids) - init_engines)
                progress(len(failed) + connected)
                time.sleep(0.5)
                for p in list(starting):
                    res = p.poll()
                    if res is not None:
                        failed.append(res)
                        starting.remove(p)
            if failed:
                nb_failed = len(failed)
                if nb_failed > 3:
                    # Only list the first few exit codes
                    failed = "%s, ..." % (
                        ', '.join('%d' % f for f in failed[:3]))
                else:
                    failed = ', '.join('%d' % f for f in failed)
                if qt_available:
                    QtGui.QMessageBox.critical(
                        None,
                        "Error",
                        "%d engine(s) exited with codes: %s" % (
                            nb_failed, failed))
                print("parallelflow: %d engine(s) exited with codes: %s" % (
                    nb_failed, failed))
            self.started_engines.update(starting)

            if qt_available:
                bar.hide()
                bar.deleteLater()
            # Report the processes that did not exit, rather than the loop
            # index (which is unbound when nb is 0 and ignores failures).
            print("parallelflow: %d engines started" % len(starting))

    def info(self):
        """Show some information on the cluster.

        The information is printed, and also shown in a dialog when Qt is
        available.
        """
        client = self.ensure_controller(connect_only=True)

        print("----- IPython information -----")
        print("profile: %s" % self.profile)
        connected = client is not None
        print("connected to controller: %s" % (
            "yes" if connected else "no"))
        st_ctrl = (self.started_controller is not None and
                   self.started_controller.poll() is None)
        print("controller started from VisTrails: %s" % (
            "running" if st_ctrl else "no"))
        st_engines = sum(1 for p in self.started_engines
                         if p.poll() is None)
        print("engines started from VisTrails: %d" % st_engines)
        if client is not None:
            nb_engines = len(client.ids)
        else:
            nb_engines = None
        print("total engines in cluster: %s" % (
            nb_engines if nb_engines is not None else "(unknown)"))
        if connected and client.ids:
            dview = client[:]
            with dview.sync_imports():
                import os
                import platform
                import socket
            engines = dview.apply_async(
                eval,
                '(os.getpid(), platform.system(), socket.getfqdn())'
            ).get_dict()
            # Items are (ip_id, (pid, system, fqdn)); order by node name,
            # then by engine id.
            engines = sorted(
                engines.items(),
                key=lambda item: (item[1][2], item[0]))
            print("engines:")
            print("\tid\tsystem\tPID\tnode FQDN")
            print("\t--\t------\t---\t---------")
            for ip_id, (pid, system, fqdn) in engines:
                print("\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn))
        print("")

        if qt_available:
            dialog = QtGui.QDialog()
            layout = QtGui.QVBoxLayout()
            form = QtGui.QFormLayout()
            form.addRow(
                "Profile:",
                QtGui.QLabel(self.profile))
            form.addRow(
                "Connected:",
                QtGui.QLabel("yes" if connected else "no"))
            form.addRow(
                "Controller started from VisTrails:",
                QtGui.QLabel("running" if st_ctrl else "no"))
            form.addRow(
                "Engines started from VisTrails:",
                QtGui.QLabel(str(st_engines)))
            form.addRow(
                "Total engines in cluster:",
                QtGui.QLabel(str(nb_engines)
                             if nb_engines is not None
                             else "(unknown)"))
            layout.addLayout(form)
            if connected and client.ids:
                tree = QtGui.QTreeWidget()
                tree.setHeaderHidden(False)
                tree.setHeaderLabels(["IPython id", "PID", "System type"])
                # Group the engines by the node they run on
                engine_tree = dict()
                for ip_id, (pid, system, fqdn) in engines:
                    engine_tree.setdefault(fqdn, []).append(
                        (ip_id, pid, system))
                for fqdn, info in engine_tree.iteritems():
                    node = QtGui.QTreeWidgetItem([fqdn])
                    tree.addTopLevelItem(node)
                    tree.setFirstItemColumnSpanned(node, True)
                    for ip_id, pid, system in info:
                        node.addChild(QtGui.QTreeWidgetItem([
                            str(ip_id),
                            str(pid),
                            system]))
                for i in xrange(tree.columnCount()):
                    tree.resizeColumnToContents(i)
                tree.expandAll()
                layout.addWidget(tree)

            ok = QtGui.QPushButton("Ok")
            QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'),
                                   dialog, QtCore.SLOT('accept()'))
            layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter)
            dialog.setLayout(layout)
            dialog.exec_()

    def change_profile(self):
        """Clean up, then select another profile.

        The previous profile is kept when no profile gets selected.
        """
        self.cleanup()

        old_profile = self.profile
        self._select_profile()
        if not self.profile:
            self.profile = old_profile

        if self.profile != old_profile:
            # Here, the processes that were started but the user didn't want
            # to clean up are abandoned
            # They will continue running but later cleanups won't ask for
            # these ones
            self.started_engines = set()
            self.started_controller = None

    def cleanup(self):
        """Shut down the started processes (with user confirmation).
        """
        engines = sum(1 for p in self.started_engines if p.poll() is None)
        ctrl = (self.started_controller is not None and
                self.started_controller.poll() is None)
        print("parallelflow: cleanup: %s, %d engines running" % (
            "controller running" if ctrl else "no controller",
            engines))

        hub_shutdown = False

        if ctrl:
            if qt_available:
                res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown controller",
                    "The controller is still running. Do you want to stop "
                    "it?",
                    QtGui.QMessageBox.Yes,
                    QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                if self._client is not None:
                    self._client.shutdown(
                        targets='all',
                        restart=False,
                        hub=True,
                        block=False)
                    hub_shutdown = True
                    print("parallelflow: requested hub shutdown")
                else:
                    # poll() is None while the process is still running;
                    # only a running process needs terminating.
                    if self.started_controller.poll() is None:
                        self.started_controller.terminate()
                        self.started_controller.wait()
                    print("parallelflow: controller terminated")
                self.started_controller = None

        if engines > 0 and not hub_shutdown:
            if qt_available:
                if self._client is not None:
                    total = " (among %d total)" % len(self._client.ids)
                else:
                    total = ''
                res = QtGui.QMessageBox.question(
                    None,
                    "Shutdown engines",
                    "%d engines started here%s are still "
                    "running. Do you want to stop them?" % (
                        engines,
                        total),
                    QtGui.QMessageBox.Yes,
                    QtGui.QMessageBox.No)
                res = res != QtGui.QMessageBox.No
            else:
                res = True
            if res:
                terminated = 0
                for engine in self.started_engines:
                    # Only engines that are still running are terminated
                    if engine.poll() is None:
                        engine.terminate()
                        engine.wait()
                        terminated += 1
                print("parallelflow: %d engines terminated" % terminated)
                self.started_engines = set()

        if self._client is not None:
            print("parallelflow: closing client")
            self._client.close()
            self._client = None

    def shutdown_cluster(self):
        """Use the client to request a shutdown of the whole cluster.
        """
        client = self.ensure_controller(connect_only=True)
        if client is None:
            if qt_available:
                QtGui.QMessageBox.information(
                    None,
                    "Couldn't connect",
                    "Couldn't connect to a controller. Is the cluster "
                    "down already?")
            print("parallelflow: shutdown_cluster requested, but could "
                  "not connect to a controller")
            return

        if qt_available:
            res = QtGui.QMessageBox.question(
                None,
                "Shutdown cluster",
                "This will use the client connection to request the hub "
                "and every engine to shutdown. Continue?",
                QtGui.QMessageBox.Ok,
                QtGui.QMessageBox.Cancel)
            if res != QtGui.QMessageBox.Ok:
                return

        self._client.shutdown(
            targets='all',
            restart=False,
            hub=True,
            block=False)
        print("parallelflow: cluster shutdown requested")
        # Close the connection before dropping it, as cleanup() does
        self._client.close()
        self._client = None
class Cluster(object):
    """Starts and stops an ipcluster for a profile, and hands out a client
    and views connected to it."""

    def __init__(self, **kwargs):
        """Read the settings from keyword arguments.

        Recognised keys: profile, cores, delay, scheduler, queue, work and
        log_level; each falls back to a default when absent.
        """
        self.profile = kwargs.get("profile", "default")
        self.n = kwargs.get("cores", 1)
        self.delay = kwargs.get("delay", DEFAULT_DELAY)
        self.scheduler = kwargs.get("scheduler", "").upper()
        self.queue = kwargs.get("queue", "hsph")
        self._client = None
        self._view = None
        self._direct_view = None
        self._work = kwargs.get("work", ".")
        self._log_level = kwargs.get("log_level", 30)
        self._cluster_id = str(uuid.uuid1())

    def _ipcluster_start_common(self):
        """Return the ``ipcluster start`` arguments shared by all modes."""
        cmd = [
            "ipcluster", "start",
            "--daemonize=True",
            "--delay=" + str(self.delay),
            "--IPClusterEngines.early_shutdown=180",
            "--log-level=" + str(self._log_level),
            "--profile=%s" % (self.profile),
            "--n=%d" % (self.n),
            "--debug",
        ]
        return cmd

    def _is_scheduler_supported(self):
        """Tell whether ``self.scheduler`` is one of the handled ones."""
        SUPPORTED_SCHEDULERS = ["LSF", "SGE"]
        return self.scheduler in SUPPORTED_SCHEDULERS

    def _start_with_scheduler(self):
        """Start the cluster with the scheduler-specific launcher classes."""
        ns = "bcbio.distributed.ipython"
        engine_class = "Bcbio%sEngineSetLauncher" % self.scheduler
        controller_class = "Bcbio%sControllerLauncher" % self.scheduler
        cmd = self._ipcluster_start_common()
        cmd.extend([
            "--IPClusterStart.controller_launcher_class=%s.%s"
            % (ns, controller_class),
            "--IPClusterStart.engine_launcher_class=%s.%s"
            % (ns, engine_class),
            "--%sLauncher.queue=%s" % (self.scheduler, self.queue),
        ])
        subprocess.check_call(cmd)

    def _start_with_local(self):
        """Start the cluster with only the common arguments."""
        cmd = self._ipcluster_start_common()
        subprocess.check_call(cmd)

    def start(self):
        """starts the cluster and connects the client to the controller

        XXX: in the future, add "--cluster-id=" + self._cluster_id to this,
        to run each new cluster with a different ID, so we can reuse the
        same profile. right now there is a bug in ipython that doesn't
        support this
        """
        if self._is_scheduler_supported():
            self._start_with_scheduler()
        else:
            self._start_with_local()

    def client(self):
        """ returns a handle to the client, connecting on first use """
        # add cluster_id=self._cluster_id to this call when the bug
        # is fixed in iPython
        # Compare against None: an existing client must be reused even if
        # it evaluates as false.
        if self._client is None:
            self._client = Client(profile=self.profile)
        return self._client

    def new_client(self):
        """Close the current client, if any, and connect a fresh one."""
        if self._client is not None:
            self._client.close()
        self._client = Client(profile=self.profile)

    def view(self):
        """ returns a blocking, load balanced view to the cluster engines """
        if self._view is not None:
            return self._view
        self._view = self.client().load_balanced_view()
        self._view.block = True
        return self._view

    def direct_view(self):
        """ returns a view on all engines (``client[:]``), cached """
        if self._direct_view is not None:
            return self._direct_view
        # Go through client() so a connection is made when none exists yet;
        # testing the bound method ``self.client`` itself is always true
        # and never connected.
        self._direct_view = self.client()[:]
        return self._direct_view

    def stop(self):
        """Run ``ipcluster stop`` for the profile; its exit code is not
        checked."""
        parg = "--profile=%s" % (self.profile)
        # add carg = "--cluster-id=%s" % (self._cluster_id) when
        # this gets fixed in iPython
        subprocess.call(["ipcluster", "stop", parg])

    def is_up(self):
        """ returns True if the cluster is completely up and false
        otherwise """
        try:
            up = len(self.client().ids)
        except IOError:
            logger.info("Waiting for the controller to come up.")
            return False
        else:
            not_up = self.n - up
            if not_up > 0:
                logger.info("Waiting for %d engines to come up." % (not_up))
                return False
            else:
                return True
class EngineManager(object): def __init__(self): self.profile = None self.started_controller = None self.started_engines = set() self._client = None def _select_profile(self): # See IPython.core.profileapp:list_profile_in() profiles = [] for filename in os.listdir(get_ipython_dir()): if filename.startswith('profile_'): profiles.append(filename[8:]) if profiles == ['default'] and not qt_available: self.profile = 'default' elif not qt_available: raise ValueError("'default' IPython profile does not exist " "and PyQt4 is not available") else: self.profile = choose_profile(profiles) def ensure_controller(self, connect_only=False): """Make sure a controller is available, else start a local one. """ if self._client: return self._client if self.profile is None: self._select_profile() if self.profile is None: return None print "parallelflow: using IPython profile %r" % self.profile try: self._client = Client(profile=self.profile) print "parallelflow: connected to controller" return self._client except error.TimeoutError: print "parallelflow: timeout when connecting to controller" if connect_only: start_ctrl = False elif qt_available: res = QtGui.QMessageBox.question( None, "Start controller", "Unable to connect to the configured IPython " "controller. Do you want to start one?", QtGui.QMessageBox.Yes | QtGui.QMessageBox.No) start_ctrl = res == QtGui.QMessageBox.Yes else: start_ctrl = True except IOError: print "parallelflow: didn't find a controller to connect to" if connect_only: start_ctrl = False elif qt_available: res = QtGui.QMessageBox.question( None, "Start controller", "No controller is configured in this IPython profile. 
" "Do you want to start one?", QtGui.QMessageBox.Yes | QtGui.QMessageBox.No) start_ctrl = res == QtGui.QMessageBox.Yes else: start_ctrl = True if start_ctrl: ctrl_pid = os.path.join(locate_profile(self.profile), 'pid', 'ipcontroller.pid') if os.path.exists(ctrl_pid): os.remove(ctrl_pid) print "parallelflow: starting controller" proc, code = self.start_process( lambda: os.path.exists(ctrl_pid), sys.executable, '-m', 'IPython.parallel.apps.ipcontrollerapp', '--profile=%s' % self.profile) if code is not None: if qt_available: QtGui.QMessageBox.critical( None, "Error", "Controller exited with code %d" % code) print( "parallelflow: controller process exited with " "code %d" % code) return None else: self.started_controller = proc print "parallelflow: controller started, connecting" self._client = Client(profile=self.profile) return self._client return None @staticmethod def start_process(condition, *args): """Executes a file and waits for a condition. """ prev_dir = os.getcwd() os.chdir(os.path.join(vistrails_root_directory(), os.path.pardir)) try: p = subprocess.Popen(args) finally: os.chdir(prev_dir) if condition is None: return p, None else: while True: time.sleep(0.5) if condition(): return p, None res = p.poll() if res is not None: return None, res def start_engines(self, nb=None, prompt="Number of engines to start"): """Start some engines locally """ c = self.ensure_controller() if c is None: if qt_available: QtGui.QMessageBox.warning( None, "No controller", "Can't start engines: couldn't connect to a " "controller") print "parallelflow: no controller, not starting engines" else: if not nb and qt_available: nb, res = QtGui.QInputDialog.getInt( None, "Start engines", prompt, 1, # value 1, # min 16) # max if not res: return elif nb is None: nb = 1 print "parallelflow: about to start %d engines" % nb if qt_available: bar = QtGui.QProgressDialog("Starting engines...", None, 0, nb) def progress(n): bar.setValue(n) bar.show() else: def progress(n): pass progress(0) 
init_engines = set(c.ids) # Start the processes starting = set() for i in xrange(nb): proc, res = self.start_process( None, sys.executable, '-m', 'IPython.parallel.apps.ipengineapp', '--profile=%s' % self.profile) starting.add(proc) # Wait for each one to either fail or connect failed = [] connected = 0 while connected < len(starting): connected = len(set(c.ids) - init_engines) progress(len(failed) + connected) time.sleep(0.5) for p in list(starting): res = p.poll() if res is not None: failed.append(res) starting.remove(p) if failed: nb_failed = len(failed) if nb_failed > 3: failed = "%s, ..." % (', '.join('%d' % f for f in failed)) else: failed = ', '.join('%d' % f for f in failed) if qt_available: QtGui.QMessageBox.critical( None, "Error", "%d engine(s) exited with codes: %s" % (nb_failed, failed)) print "parallelflow: %d engine(s) exited with codes: %s" % ( nb_failed, failed) self.started_engines.update(starting) if qt_available: bar.hide() bar.deleteLater() print "parallelflow: %d engines started" % nb def info(self): """Show some information on the cluster. 
""" client = self.ensure_controller(connect_only=True) print "----- IPython information -----" print "profile: %s" % self.profile connected = client is not None print "connected to controller: %s" % ("yes" if connected else "no") st_ctrl = (self.started_controller is not None and self.started_controller.poll() is None) print "controller started from VisTrails: %s" % ("running" if st_ctrl else "no") st_engines = sum(1 for p in self.started_engines if p.poll() is None) print "engines started from VisTrails: %d" % st_engines if client is not None: nb_engines = len(client.ids) else: nb_engines = None print "total engines in cluster: %s" % (nb_engines if nb_engines is not None else "(unknown)") if connected and client.ids: dview = client[:] with dview.sync_imports(): import os import platform import socket engines = dview.apply_async( eval, '(os.getpid(), platform.system(), socket.getfqdn())').get_dict( ) engines = sorted(engines.items(), key=lambda (ip_id, (pid, system, fqdn)): (fqdn, ip_id)) print "engines:" print "\tid\tsystem\tPID\tnode FQDN" print "\t--\t------\t---\t---------" for ip_id, (pid, system, fqdn) in engines: print "\t%d\t%s\t%d\t%s" % (ip_id, system, pid, fqdn) print "" if qt_available: dialog = QtGui.QDialog() layout = QtGui.QVBoxLayout() form = QtGui.QFormLayout() form.addRow("Profile:", QtGui.QLabel(self.profile)) form.addRow("Connected:", QtGui.QLabel("yes" if connected else "no")) form.addRow("Controller started from VisTrails:", QtGui.QLabel("running" if st_ctrl else "no")) form.addRow("Engines started from VisTrails:", QtGui.QLabel(str(st_engines))) form.addRow( "Total engines in cluster:", QtGui.QLabel( str(nb_engines) if nb_engines is not None else "(unknown)") ) layout.addLayout(form) if connected and client.ids: tree = QtGui.QTreeWidget() tree.setHeaderHidden(False) tree.setHeaderLabels(["IPython id", "PID", "System type"]) engine_tree = dict() for ip_id, (pid, system, fqdn) in engines: engine_tree.setdefault(fqdn, []).append( (ip_id, pid, 
system)) for fqdn, info in engine_tree.iteritems(): node = QtGui.QTreeWidgetItem([fqdn]) tree.addTopLevelItem(node) tree.setFirstItemColumnSpanned(node, True) for ip_id, pid, system in info: node.addChild( QtGui.QTreeWidgetItem( [str(ip_id), str(pid), system])) for i in xrange(tree.columnCount()): tree.resizeColumnToContents(i) tree.expandAll() layout.addWidget(tree) ok = QtGui.QPushButton("Ok") QtCore.QObject.connect(ok, QtCore.SIGNAL('clicked()'), dialog, QtCore.SLOT('accept()')) layout.addWidget(ok, 1, QtCore.Qt.AlignHCenter) dialog.setLayout(layout) dialog.exec_() def change_profile(self): self.cleanup() old_profile = self.profile self._select_profile() if not self.profile: self.profile = old_profile if self.profile != old_profile: # Here, the processes that were started but the user didn't want to # clean up are abandonned # They will continue running but later cleanups won't ask for these # ones self.started_engines = set() self.started_controller = None def cleanup(self): """Shut down the started processes (with user confirmation). """ engines = sum(1 for p in self.started_engines if p.poll() is None) ctrl = (self.started_controller is not None and self.started_controller.poll() is None) print("parallelflow: cleanup: %s, %d engines running" % ("controller running" if ctrl else "no controller", engines)) hub_shutdown = False if ctrl: if qt_available: res = QtGui.QMessageBox.question( None, "Shutdown controller", "The controller is still running. 
Do you want to stop " "it?", QtGui.QMessageBox.Yes, QtGui.QMessageBox.No) res = res != QtGui.QMessageBox.No else: res = True if res: if self._client is not None: self._client.shutdown(targets='all', restart=False, hub=True, block=False) hub_shutdown = True print "parallelflow: requested hub shutdown" else: if self.started_controller.poll() is not None: self.started_controller.terminate() self.started_controller.wait() print "parallelflow: controller terminated" self.started_controller = None if engines > 0 and not hub_shutdown: if qt_available: if self._client is not None: total = " (among %d total)" % len(self._client.ids) else: total = '' res = QtGui.QMessageBox.question( None, "Shutdown engines", "%d engines started here%s are still " "running. Do you want to stop them?" % (engines, total), QtGui.QMessageBox.Yes, QtGui.QMessageBox.No) res = res != QtGui.QMessageBox.No else: res = True if res: for engine in self.started_engines: if engine.poll() is not None: engine.terminate() engine.wait() print("parallelflow: %d engines terminated" % len(self.started_engines)) self.started_engines = set() if self._client is not None: print "parallelflow: closing client" self._client.close() self._client = None def shutdown_cluster(self): """Use the client to request a shutdown of the whole cluster. """ client = self.ensure_controller(connect_only=True) if client is None: if qt_available: QtGui.QMessageBox.information( None, "Couldn't connect", "Couldn't connect to a controller. Is the cluster " "down already?") print( "parallelflow: shutdown_cluster requested, but could " "not connect to a controller") return if qt_available: res = QtGui.QMessageBox.question( None, "Shutdown cluster", "This will use the client connection to request the hub " "and every engine to shutdown. 
Continue?", QtGui.QMessageBox.Ok, QtGui.QMessageBox.Cancel) if res != QtGui.QMessageBox.Ok: return self._client.shutdown(targets='all', restart=False, hub=True, block=False) print "parallelflow: cluster shutdown requested" self._client = None
class Cluster(object):
    """Start, connect to and stop an IPython parallel cluster via ``ipcluster``.

    Keyword arguments (all optional):
      profile   -- IPython profile name (default "default").
      cores     -- number of engines requested with ``--n`` (default 1).
      delay     -- value passed as ``--delay`` (default DEFAULT_DELAY).
      scheduler -- scheduler name, upper-cased on storage; "LSF" and "SGE"
                   use the bcbio launcher classes, anything else starts the
                   cluster without a scheduler.
      queue     -- queue passed to the scheduler launcher (default "hsph").
      work      -- stored as ``_work`` (default "."); not read by any method
                   of this class.
      log_level -- value passed as ``--log-level`` (default 30).
    """

    def __init__(self, **kwargs):
        self.profile = kwargs.get("profile", "default")
        self.n = kwargs.get("cores", 1)
        self.delay = kwargs.get("delay", DEFAULT_DELAY)
        self.scheduler = kwargs.get("scheduler", "").upper()
        self.queue = kwargs.get("queue", "hsph")
        # Lazily created connection and views, cached after first use
        self._client = None
        self._view = None
        self._direct_view = None
        self._work = kwargs.get("work", ".")
        self._log_level = kwargs.get("log_level", 30)
        self._cluster_id = str(uuid.uuid1())

    def _ipcluster_start_common(self):
        """Return the ``ipcluster start`` arguments shared by all launch modes."""
        return [
            "ipcluster",
            "start",
            "--daemonize=True",
            "--delay=" + str(self.delay),
            "--IPClusterEngines.early_shutdown=180",
            "--log-level=" + str(self._log_level),
            "--profile=%s" % (self.profile),
            "--n=%d" % (self.n),
            "--debug",
        ]

    def _is_scheduler_supported(self):
        """Return True if ``self.scheduler`` has dedicated launcher classes."""
        return self.scheduler in ("LSF", "SGE")

    def _start_with_scheduler(self):
        """Run ``ipcluster start`` with the scheduler-specific launchers.

        Raises subprocess.CalledProcessError if the command fails.
        """
        ns = "bcbio.distributed.ipython"
        engine_class = "Bcbio%sEngineSetLauncher" % self.scheduler
        controller_class = "Bcbio%sControllerLauncher" % self.scheduler
        cmd = self._ipcluster_start_common()
        cmd.extend([
            "--IPClusterStart.controller_launcher_class=%s.%s"
            % (ns, controller_class),
            "--IPClusterStart.engine_launcher_class=%s.%s"
            % (ns, engine_class),
            "--%sLauncher.queue=%s" % (self.scheduler, self.queue),
        ])
        subprocess.check_call(cmd)

    def _start_with_local(self):
        """Run ``ipcluster start`` with the common arguments only.

        Raises subprocess.CalledProcessError if the command fails.
        """
        subprocess.check_call(self._ipcluster_start_common())

    def start(self):
        """Start the cluster, through the scheduler when it is supported.

        XXX: in the future, add "--cluster-id=" + self._cluster_id to the
        command, to run each new cluster with a different ID, so we can
        reuse the same profile. Right now there is a bug in IPython that
        doesn't support this.
        """
        if self._is_scheduler_supported():
            self._start_with_scheduler()
        else:
            self._start_with_local()

    def client(self):
        """Return the cached client, connecting on first use."""
        # add cluster_id=self._cluster_id to this call when the bug
        # is fixed in IPython
        # Compare against None: a connected client with no engines yet is
        # falsy (its len() is 0) and must not be replaced by a new one.
        if self._client is None:
            self._client = Client(profile=self.profile)
        return self._client

    def new_client(self):
        """Close the current client, if any, and connect a fresh one."""
        if self._client is not None:
            self._client.close()
        # Views created from the closed client must not be handed out again
        self._view = None
        self._direct_view = None
        self._client = Client(profile=self.profile)

    def view(self):
        """Return a blocking, load balanced view to the cluster engines."""
        if self._view is None:
            balanced = self.client().load_balanced_view()
            balanced.block = True
            self._view = balanced
        return self._view

    def direct_view(self):
        """Return a direct view on all engines (``client[:]``), cached."""
        if self._direct_view is None:
            # self.client is a method, so the connection has to be obtained
            # by calling it rather than by testing the attribute itself
            self._direct_view = self.client()[:]
        return self._direct_view

    def stop(self):
        """Run ``ipcluster stop`` for this profile; the exit code is ignored."""
        parg = "--profile=%s" % (self.profile)
        # add carg = "--cluster-id=%s" % (self._cluster_id) when
        # this gets fixed in IPython
        subprocess.call(["ipcluster", "stop", parg])

    def is_up(self):
        """Return True if at least ``self.n`` engines are connected.

        Returns False when connecting to the controller raises IOError or
        when fewer than ``self.n`` engines have registered.
        """
        try:
            up = len(self.client().ids)
        except IOError:
            logger.info("Waiting for the controller to come up.")
            return False
        not_up = self.n - up
        if not_up > 0:
            logger.info("Waiting for %d engines to come up.", not_up)
            return False
        return True