def clusterRemote(opt, arg):
    """Start a remote cluster over SSH.

    The file named by ``opt.clusterfile`` is run with ``execfile`` and the
    names it defines are used as the cluster configuration:

    * ``controller`` -- mapping whose ``host`` entry names the controller
      machine.
    * ``engines`` -- mapping of engine host to the number of engines to
      start on it (only integer values are supported).
    * ``sshx`` -- optional remote wrapper command; when absent the
      ``IPYTHON_SSHX`` environment variable is used, then plain ``sshx``.
    * ``ssh_options_string`` -- text inserted right after ``ssh`` in every
      command that is issued.
    * ``push_kwargs`` -- keyword arguments forwarded to
      ``AWS.push_engine_furl`` for every engine host.

    ``opt.logfile`` is the log file prefix (None selects ``log/ipcluster``
    under the IPython directory) and ``opt.mpi``, when true, is handed to
    every engine as ``--mpi=<value>``.  ``arg`` is not used.

    Raises NotImplementedError when an ``engines`` value is not an integer.
    The required configuration entries are read with plain indexing, so a
    missing one raises KeyError.
    """
    # NOTE: the cluster file is executed as Python code, so it must be trusted.
    config = {}
    execfile(opt.clusterfile, config)
    controller_cfg = config['controller']
    engines_cfg = config['engines']
    sshx = config.get('sshx', os.environ.get('IPYTHON_SSHX', 'sshx'))
    ssh_opts = config['ssh_options_string']
    push_kwargs = config['push_kwargs']

    # All logs live inside the ipython directory unless a prefix was given.
    ipython_dir = cutils.get_ipython_dir()
    log_prefix = opt.logfile
    if log_prefix is None:
        log_dir = os.path.join(ipython_dir, 'log')
        ensureDir(log_dir)
        log_prefix = os.path.join(log_dir, 'ipcluster')
    # The PID of this script is always appended to the prefix.
    log_prefix = '%s-%s' % (log_prefix, os.getpid())

    def launch(command):
        # Echo the command (debug aid), then hand it to the shell.
        print('cmd:<%s>' % command)
        os.system(command)

    print('Starting controller:')
    controller_host = controller_cfg['host']
    controller_log = '%s-con-%s-' % (log_prefix, controller_host)
    remote = "ssh %s %s '%s'" % (ssh_opts, controller_host, sshx)

    # Remove the remote .furl/.pem files before the controller is started.
    launch(remote + " 'rm ~/.ipython/*.furl ~/.ipython/*.pem'")
    time.sleep(1)
    # The trailing '&' makes the shell run the controller in the background.
    launch("%s 'ipcontroller --logfile %s' &" % (remote, controller_log))
    time.sleep(2)

    import AWS

    # With mpi, killing the engines with sigterm will not work.
    mpi = opt.mpi
    mpi_flag = ''
    if mpi:
        mpi_flag = '--mpi=' + mpi

    print('Starting engines: ')
    for host, count in engines_cfg.iteritems():
        if not isinstance(count, int):
            raise NotImplementedError(
                'port configuration not finished for engines')

        print('Pushing furl to %s' % host)
        AWS.push_engine_furl(host, **push_kwargs)

        print('Starting %d engines on %s' % (count, host))
        engine_log = '%s-eng-%s-' % (log_prefix, host)
        # Every engine on a host is started with the very same command.
        engine_cmd = "ssh %s %s '%s' 'ipengine %s --logfile %s' &" % (
            ssh_opts, host, sshx, mpi_flag, engine_log)
        for _ in range(count):
            launch(engine_cmd)
        # Wait a little after each host.
        time.sleep(1)

    startMsg(controller_host)
def clusterRemote(opt, arg):
    """Start a remote cluster over SSH.

    Configuration comes from executing the file ``opt.clusterfile`` with
    ``execfile``.  The entries read from it are ``controller`` (its ``host``
    is where ``ipcontroller`` is started), ``engines`` (engine host mapped
    to an integer engine count), ``ssh_options_string`` (inserted after
    ``ssh`` in each command), ``push_kwargs`` (keyword arguments for
    ``AWS.push_engine_furl``) and, optionally, ``sshx`` (falls back to the
    ``IPYTHON_SSHX`` environment variable and then to ``sshx``).

    Log names are derived from ``opt.logfile``, or from ``log/ipcluster``
    under the IPython directory when it is None, with this script's PID
    appended.  A true ``opt.mpi`` adds ``--mpi=<value>`` to each engine
    command.  ``arg`` is not used.

    Raises NotImplementedError for a non-integer ``engines`` value; the
    required configuration entries are indexed directly, so a missing one
    raises KeyError.
    """
    # The cluster file is plain Python that gets executed here: only point
    # this at files you trust.
    settings = {}
    execfile(opt.clusterfile, settings)
    controller_settings = settings["controller"]
    engine_settings = settings["engines"]
    sshx = settings.get("sshx", os.environ.get("IPYTHON_SSHX", "sshx"))
    ssh_options = settings["ssh_options_string"]
    furl_push_kwargs = settings["push_kwargs"]

    # Default log location is the "log" directory inside the ipython dir.
    ipython_dir = cutils.get_ipython_dir()
    log_base = opt.logfile
    if log_base is None:
        log_dir = os.path.join(ipython_dir, "log")
        ensureDir(log_dir)
        log_base = os.path.join(log_dir, "ipcluster")
    # This script's PID is appended unconditionally.
    log_base = "%s-%s" % (log_base, os.getpid())

    print("Starting controller:")
    controller_host = controller_settings["host"]
    controller_log = "%s-con-%s-" % (log_base, controller_host)
    ssh_target = (ssh_options, controller_host, sshx)
    # (command, seconds to sleep afterwards): first remove the remote
    # .furl/.pem files, then start the controller in the background.
    controller_steps = (
        ("ssh %s %s '%s' 'rm ~/.ipython/*.furl ~/.ipython/*.pem'" % ssh_target, 1),
        ("ssh %s %s '%s' 'ipcontroller --logfile %s' &" % (ssh_target + (controller_log,)), 2),
    )
    for command, pause in controller_steps:
        print("cmd:<%s>" % command)  # dbg
        os.system(command)
        time.sleep(pause)

    import AWS

    # With mpi, killing the engines with sigterm will not work.
    mpi = opt.mpi
    if mpi:
        mpi_option = "--mpi=" + mpi
    else:
        mpi_option = ""

    print("Starting engines: ")
    for engine_host, engine_count in engine_settings.iteritems():
        if not isinstance(engine_count, int):
            raise NotImplementedError("port configuration not finished for engines")

        print("Pushing furl to %s" % engine_host)
        AWS.push_engine_furl(engine_host, **furl_push_kwargs)

        print("Starting %d engines on %s" % (engine_count, engine_host))
        engine_log = "%s-eng-%s-" % (log_base, engine_host)
        engine_command = "ssh %s %s '%s' 'ipengine %s --logfile %s' &" % (
            ssh_options,
            engine_host,
            sshx,
            mpi_option,
            engine_log,
        )
        launched = 0
        while launched < engine_count:
            print("cmd:<%s>" % engine_command)  # dbg
            os.system(engine_command)
            launched += 1
        # Wait after each host a little bit.
        time.sleep(1)

    startMsg(controller_host)
def clusterLocal(opt, arg): """Start a cluster on the local machine.""" # Store all logs inside the ipython directory ipdir = cutils.get_ipython_dir() pjoin = os.path.join logfile = opt.logfile if logfile is None: logdir_base = pjoin(ipdir, 'log') ensureDir(logdir_base) logfile = pjoin(logdir_base, 'ipcluster-') print 'Starting controller:', controller = Popen(['ipcontroller', '--logfile', logfile, '-x', '-y']) print 'Controller PID:', controller.pid print 'Starting engines: ', time.sleep(5) englogfile = '%s%s-' % (logfile, controller.pid) mpi = opt.mpi if mpi: # start with mpi - killing the engines with sigterm will not work if you do this engines = [ Popen([ 'mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi, '--logfile', englogfile ]) ] # engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi])] else: # do what we would normally do engines = [ Popen(['ipengine', '--logfile', englogfile]) for i in range(opt.n) ] eids = [e.pid for e in engines] print 'Engines PIDs: ', eids print 'Log files: %s*' % englogfile proc_ids = eids + [controller.pid] procs = engines + [controller] grpid = os.getpgrp() try: startMsg('127.0.0.1') print 'You can also hit Ctrl-C to stop it, or use from the cmd line:' print print 'kill -INT', grpid print try: while True: time.sleep(5) except: pass finally: print 'Stopping cluster. Cleaning up...' cleanup(stop, controller, engines) for i in range(4): time.sleep(i + 2) nZombies = numAlive(controller, engines) if nZombies == 0: print 'OK: All processes cleaned up.' break print 'Trying again, %d processes did not stop...' % nZombies cleanup(kill, controller, engines) if numAlive(controller, engines) == 0: print 'OK: All processes cleaned up.' break else: print '*' * 75 print 'ERROR: could not kill some processes, try to do it', print 'manually.' 
zombies = [] if controller.returncode is None: print 'Controller is alive: pid =', controller.pid zombies.append(controller.pid) liveEngines = [e for e in engines if e.returncode is None] for e in liveEngines: print 'Engine is alive: pid =', e.pid zombies.append(e.pid) print print 'Zombie summary:', ' '.join(map(str, zombies))
def clusterLocal(opt, arg): """Start a cluster on the local machine.""" # Store all logs inside the ipython directory ipdir = cutils.get_ipython_dir() pjoin = os.path.join logfile = opt.logfile if logfile is None: logdir_base = pjoin(ipdir, "log") ensureDir(logdir_base) logfile = pjoin(logdir_base, "ipcluster-") print "Starting controller:", controller = Popen(["ipcontroller", "--logfile", logfile, "-x", "-y"]) print "Controller PID:", controller.pid print "Starting engines: ", time.sleep(5) englogfile = "%s%s-" % (logfile, controller.pid) mpi = opt.mpi if mpi: # start with mpi - killing the engines with sigterm will not work if you do this engines = [Popen(["mpirun", "-np", str(opt.n), "ipengine", "--mpi", mpi, "--logfile", englogfile])] # engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi])] else: # do what we would normally do engines = [Popen(["ipengine", "--logfile", englogfile]) for i in range(opt.n)] eids = [e.pid for e in engines] print "Engines PIDs: ", eids print "Log files: %s*" % englogfile proc_ids = eids + [controller.pid] procs = engines + [controller] grpid = os.getpgrp() try: startMsg("127.0.0.1") print "You can also hit Ctrl-C to stop it, or use from the cmd line:" print print "kill -INT", grpid print try: while True: time.sleep(5) except: pass finally: print "Stopping cluster. Cleaning up..." cleanup(stop, controller, engines) for i in range(4): time.sleep(i + 2) nZombies = numAlive(controller, engines) if nZombies == 0: print "OK: All processes cleaned up." break print "Trying again, %d processes did not stop..." % nZombies cleanup(kill, controller, engines) if numAlive(controller, engines) == 0: print "OK: All processes cleaned up." break else: print "*" * 75 print "ERROR: could not kill some processes, try to do it", print "manually." 
zombies = [] if controller.returncode is None: print "Controller is alive: pid =", controller.pid zombies.append(controller.pid) liveEngines = [e for e in engines if e.returncode is None] for e in liveEngines: print "Engine is alive: pid =", e.pid zombies.append(e.pid) print print "Zombie summary:", " ".join(map(str, zombies))