Example #1
0
def clusterRemote(opt, arg):
    """Start a remote cluster over SSH"""

    # Load the remote cluster configuration
    clConfig = {}
    execfile(opt.clusterfile, clConfig)
    contConfig = clConfig['controller']
    engConfig = clConfig['engines']
    # Determine where to find sshx:
    sshx = clConfig.get('sshx', os.environ.get('IPYTHON_SSHX', 'sshx'))

    #ADDED CONFIG ITEMS
    sshOpts = clConfig['ssh_options_string']
    pushConfig = clConfig['push_kwargs']

    # Store all logs inside the ipython directory
    ipdir = cutils.get_ipython_dir()
    pjoin = os.path.join

    logfile = opt.logfile
    if logfile is None:
        logdir_base = pjoin(ipdir, 'log')
        ensureDir(logdir_base)
        logfile = pjoin(logdir_base, 'ipcluster')

    # Append this script's PID to the logfile name always
    logfile = '%s-%s' % (logfile, os.getpid())

    print 'Starting controller:'
    # Controller data:
    xsys = os.system

    contHost = contConfig['host']
    contLog = '%s-con-%s-' % (logfile, contHost)

    cmd = "ssh %s %s '%s' 'rm ~/.ipython/*.furl ~/.ipython/*.pem'" % \
          (sshOpts,contHost,sshx)
    print 'cmd:<%s>' % cmd  # dbg
    xsys(cmd)
    time.sleep(1)

    cmd = "ssh %s %s '%s' 'ipcontroller --logfile %s' &" % \
          (sshOpts,contHost,sshx,contLog)
    print 'cmd:<%s>' % cmd  # dbg
    xsys(cmd)
    time.sleep(2)

    import AWS

    #check for mpi - copied from above
    mpi = opt.mpi
    mpistr = ''
    if mpi:  # start with mpi - killing the engines with sigterm will not work if you do this
        mpistr = '--mpi=' + mpi

    print 'Starting engines:   '
    for engineHost, engineData in engConfig.iteritems():

        if isinstance(engineData, int):
            numEngines = engineData
        else:
            raise NotImplementedError(
                'port configuration not finished for engines')

        print 'Pushing furl to %s' % engineHost
        AWS.push_engine_furl(engineHost, **pushConfig)

        print 'Starting %d engines on %s' % (numEngines, engineHost)
        engLog = '%s-eng-%s-' % (logfile, engineHost)
        for i in range(numEngines):
            #cmd = "ssh %s '%s' 'ipengine --controller-ip %s --logfile %s' &" % \ (engineHost,sshx,contHost,engLog)
            cmd = "ssh %s %s '%s' 'ipengine %s --logfile %s' &" % (
                sshOpts, engineHost, sshx, mpistr, engLog)

            print 'cmd:<%s>' % cmd  # dbg
            xsys(cmd)
        # Wait after each host a little bit
        time.sleep(1)

    startMsg(contConfig['host'])
Example #2
0
def clusterRemote(opt, arg):
    """Start a remote cluster over SSH"""

    # Load the remote cluster configuration
    clConfig = {}
    execfile(opt.clusterfile, clConfig)
    contConfig = clConfig["controller"]
    engConfig = clConfig["engines"]
    # Determine where to find sshx:
    sshx = clConfig.get("sshx", os.environ.get("IPYTHON_SSHX", "sshx"))

    # ADDED CONFIG ITEMS
    sshOpts = clConfig["ssh_options_string"]
    pushConfig = clConfig["push_kwargs"]

    # Store all logs inside the ipython directory
    ipdir = cutils.get_ipython_dir()
    pjoin = os.path.join

    logfile = opt.logfile
    if logfile is None:
        logdir_base = pjoin(ipdir, "log")
        ensureDir(logdir_base)
        logfile = pjoin(logdir_base, "ipcluster")

    # Append this script's PID to the logfile name always
    logfile = "%s-%s" % (logfile, os.getpid())

    print "Starting controller:"
    # Controller data:
    xsys = os.system

    contHost = contConfig["host"]
    contLog = "%s-con-%s-" % (logfile, contHost)

    cmd = "ssh %s %s '%s' 'rm ~/.ipython/*.furl ~/.ipython/*.pem'" % (sshOpts, contHost, sshx)
    print "cmd:<%s>" % cmd  # dbg
    xsys(cmd)
    time.sleep(1)

    cmd = "ssh %s %s '%s' 'ipcontroller --logfile %s' &" % (sshOpts, contHost, sshx, contLog)
    print "cmd:<%s>" % cmd  # dbg
    xsys(cmd)
    time.sleep(2)

    import AWS

    # check for mpi - copied from above
    mpi = opt.mpi
    mpistr = ""
    if mpi:  # start with mpi - killing the engines with sigterm will not work if you do this
        mpistr = "--mpi=" + mpi

    print "Starting engines:   "
    for engineHost, engineData in engConfig.iteritems():

        if isinstance(engineData, int):
            numEngines = engineData
        else:
            raise NotImplementedError("port configuration not finished for engines")

        print "Pushing furl to %s" % engineHost
        AWS.push_engine_furl(engineHost, **pushConfig)

        print "Starting %d engines on %s" % (numEngines, engineHost)
        engLog = "%s-eng-%s-" % (logfile, engineHost)
        for i in range(numEngines):
            # cmd = "ssh %s '%s' 'ipengine --controller-ip %s --logfile %s' &" % \ (engineHost,sshx,contHost,engLog)
            cmd = "ssh %s %s '%s' 'ipengine %s --logfile %s' &" % (sshOpts, engineHost, sshx, mpistr, engLog)

            print "cmd:<%s>" % cmd  # dbg
            xsys(cmd)
        # Wait after each host a little bit
        time.sleep(1)

    startMsg(contConfig["host"])
Example #3
0
def clusterLocal(opt, arg):
    """Start a cluster on the local machine."""

    # Store all logs inside the ipython directory
    ipdir = cutils.get_ipython_dir()
    pjoin = os.path.join

    logfile = opt.logfile
    if logfile is None:
        logdir_base = pjoin(ipdir, 'log')
        ensureDir(logdir_base)
        logfile = pjoin(logdir_base, 'ipcluster-')

    print 'Starting controller:',
    controller = Popen(['ipcontroller', '--logfile', logfile, '-x', '-y'])
    print 'Controller PID:', controller.pid

    print 'Starting engines:   ',
    time.sleep(5)

    englogfile = '%s%s-' % (logfile, controller.pid)
    mpi = opt.mpi
    if mpi:  # start with mpi - killing the engines with sigterm will not work if you do this
        engines = [
            Popen([
                'mpirun', '-np',
                str(opt.n), 'ipengine', '--mpi', mpi, '--logfile', englogfile
            ])
        ]
        # engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi])]
    else:  # do what we would normally do
        engines = [
            Popen(['ipengine', '--logfile', englogfile]) for i in range(opt.n)
        ]
    eids = [e.pid for e in engines]
    print 'Engines PIDs:  ', eids
    print 'Log files: %s*' % englogfile

    proc_ids = eids + [controller.pid]
    procs = engines + [controller]

    grpid = os.getpgrp()
    try:
        startMsg('127.0.0.1')
        print 'You can also hit Ctrl-C to stop it, or use from the cmd line:'
        print
        print 'kill -INT', grpid
        print
        try:
            while True:
                time.sleep(5)
        except:
            pass
    finally:
        print 'Stopping cluster.  Cleaning up...'
        cleanup(stop, controller, engines)
        for i in range(4):
            time.sleep(i + 2)
            nZombies = numAlive(controller, engines)
            if nZombies == 0:
                print 'OK: All processes cleaned up.'
                break
            print 'Trying again, %d processes did not stop...' % nZombies
            cleanup(kill, controller, engines)
            if numAlive(controller, engines) == 0:
                print 'OK: All processes cleaned up.'
                break
        else:
            print '*' * 75
            print 'ERROR: could not kill some processes, try to do it',
            print 'manually.'
            zombies = []
            if controller.returncode is None:
                print 'Controller is alive: pid =', controller.pid
                zombies.append(controller.pid)
            liveEngines = [e for e in engines if e.returncode is None]
            for e in liveEngines:
                print 'Engine is alive:     pid =', e.pid
                zombies.append(e.pid)
            print
            print 'Zombie summary:', ' '.join(map(str, zombies))
Example #4
0
def clusterLocal(opt, arg):
    """Start a cluster on the local machine."""

    # Store all logs inside the ipython directory
    ipdir = cutils.get_ipython_dir()
    pjoin = os.path.join

    logfile = opt.logfile
    if logfile is None:
        logdir_base = pjoin(ipdir, "log")
        ensureDir(logdir_base)
        logfile = pjoin(logdir_base, "ipcluster-")

    print "Starting controller:",
    controller = Popen(["ipcontroller", "--logfile", logfile, "-x", "-y"])
    print "Controller PID:", controller.pid

    print "Starting engines:   ",
    time.sleep(5)

    englogfile = "%s%s-" % (logfile, controller.pid)
    mpi = opt.mpi
    if mpi:  # start with mpi - killing the engines with sigterm will not work if you do this
        engines = [Popen(["mpirun", "-np", str(opt.n), "ipengine", "--mpi", mpi, "--logfile", englogfile])]
        # engines = [Popen(['mpirun', '-np', str(opt.n), 'ipengine', '--mpi', mpi])]
    else:  # do what we would normally do
        engines = [Popen(["ipengine", "--logfile", englogfile]) for i in range(opt.n)]
    eids = [e.pid for e in engines]
    print "Engines PIDs:  ", eids
    print "Log files: %s*" % englogfile

    proc_ids = eids + [controller.pid]
    procs = engines + [controller]

    grpid = os.getpgrp()
    try:
        startMsg("127.0.0.1")
        print "You can also hit Ctrl-C to stop it, or use from the cmd line:"
        print
        print "kill -INT", grpid
        print
        try:
            while True:
                time.sleep(5)
        except:
            pass
    finally:
        print "Stopping cluster.  Cleaning up..."
        cleanup(stop, controller, engines)
        for i in range(4):
            time.sleep(i + 2)
            nZombies = numAlive(controller, engines)
            if nZombies == 0:
                print "OK: All processes cleaned up."
                break
            print "Trying again, %d processes did not stop..." % nZombies
            cleanup(kill, controller, engines)
            if numAlive(controller, engines) == 0:
                print "OK: All processes cleaned up."
                break
        else:
            print "*" * 75
            print "ERROR: could not kill some processes, try to do it",
            print "manually."
            zombies = []
            if controller.returncode is None:
                print "Controller is alive: pid =", controller.pid
                zombies.append(controller.pid)
            liveEngines = [e for e in engines if e.returncode is None]
            for e in liveEngines:
                print "Engine is alive:     pid =", e.pid
                zombies.append(e.pid)
            print
            print "Zombie summary:", " ".join(map(str, zombies))