예제 #1
0
파일: lcloud.py 프로젝트: danx0r/burst
def get_server_state(srv):
    """Return the current state of *srv* as reported by the cloud driver.

    The node list is fetched again from ``config.driver`` on every call, and
    the state of the first node whose uuid starts with ``srv.uuid`` is
    returned. If no node matches, a message is printed via ``vprint`` and
    ``'terminated'`` is returned.
    """
    # A fresh listing is needed; the state on the passed-in object may be stale.
    matches = [
        candidate for candidate in config.driver.list_nodes()
        if candidate.uuid.startswith(srv.uuid)
    ]
    if not matches:
        vprint("Cannot find server to determine state; assuming terminated")
        return 'terminated'
    return matches[0].state
예제 #2
0
파일: lcloud.py 프로젝트: danx0r/burst
def terminate_server(srv):
    """Destroy *srv* and block until it is reported as terminated.

    Returns ``"error terminating server"`` if the driver's ``destroy_node``
    call returns a falsy result, otherwise ``"success"`` once
    ``get_server_state`` reports ``'terminated'``.
    """
    if not config.driver.destroy_node(srv):
        return "error terminating server"

    # Poll at least once; each round is: query, pause, report.
    while True:
        current = get_server_state(srv)
        time.sleep(2)
        vprint("server state:", current)
        if current == 'terminated':
            return "success"
예제 #3
0
파일: lcloud.py 프로젝트: danx0r/burst
def stop_server(srv):
    """Stop *srv* and block until it is reported as stopped.

    Returns ``"error stopping server"`` if ``srv.stop_node()`` returns a
    falsy result, otherwise ``"success"`` once ``get_server_state`` reports
    ``'stopped'``.
    """
    if not srv.stop_node():
        return "error stopping server"

    # Poll at least once; each round is: query, pause, report.
    while True:
        current = get_server_state(srv)
        time.sleep(2)
        vprint("server state:", current)
        if current == 'stopped':
            return "success"
예제 #4
0
파일: lcloud.py 프로젝트: danx0r/burst
def start_server(srv):
    """Start *srv*, wait until it is running and has a usable public IP.

    Returns ``"error starting server"`` if ``srv.start()`` returns a falsy
    result. Otherwise returns the (possibly refreshed) node object once its
    ``public_ips`` list is non-empty and not made up solely of ``None``.
    """
    if not srv.start():
        return "error starting server"

    # Poll at least once; each round is: query, pause, report.
    while True:
        current = get_server_state(srv)
        time.sleep(2)
        vprint("server state:", current)
        if current == 'running':
            break

    vprint("Waiting for public IP address to be assigned")
    config.driver.wait_until_running([srv])
    vprint("Public IPs:", srv.public_ips)

    while True:
        # A list such as [None] has been observed, so an all-None list
        # counts as "no address yet".
        if len(srv.public_ips) != 0 and srv.public_ips.count(None) != len(
                srv.public_ips):
            return srv
        # Re-fetch the node; the existing object does not update itself.
        srv = get_server(uuid=srv.uuid)
        vprint("Public IPs:", srv.public_ips)
        time.sleep(5)
예제 #5
0
def burst(args,
          sshuser=None,
          url=None,
          uuid=None,
          burst_user=None,
          gpu=False,
          ports=None,
          stop=False,
          image=None,
          vmtype=None,
          pubkey=None,
          dockerfile="Dockerfile",
          cloudmap="",
          dockerdport=2376,
          bgd=False,
          sync_only=False,
          conf=None):
    """Build and run the project's docker image, locally or on a cloud server.

    When any of ``url``, ``uuid`` or ``burst_user`` is given, the matching
    server is looked up (or launched / restarted), the working directory is
    rsync'd to it, docker is driven through an ssh tunnel, and the remote
    folder is rsync'd back afterwards. Otherwise docker runs locally.

    Args:
        args: Sequence of command words to run inside the container. If the
            first word is ``'jupyter'``, the first entry of ``ports`` is
            recorded in the ``ai.burstable.jupyter`` label.
        sshuser: ssh login name. If omitted and ``url`` is given, ``url`` is
            split on ``'@'`` into user and host.
        url: Server address (optionally ``user@host``).
        uuid: Server uuid, passed to ``get_server``.
        burst_user: Server name; a server is launched if none is found.
        gpu: If true, adds ``--gpus all`` to ``docker run`` and is forwarded
            to ``launch_server``.
        ports: Port specs forwarded to ``ssh_tunnel``.
        stop: Value written to the ``ai.burstable.shutdown`` label.
        image: Host image, forwarded to ``launch_server``.
        vmtype: Instance type; must match the existing server's type.
        pubkey: Public key forwarded to ``launch_server``.
        dockerfile: Path of the Dockerfile to build.
        cloudmap: ``"<cloud>:<host path>"`` mapping mounted with rclone.
        dockerdport: Local port used to reach the remote docker daemon.
        bgd: Run the container detached and skip the sync back to local.
        sync_only: Skip build/run; only synchronize from remote to local.
        conf: Configuration overrides forwarded to the server helpers.

    Returns:
        ``None`` on success, or the string ``"Exception"`` if any exception
        was raised (the exception is printed, not propagated).
    """
    error = None
    tunnel = None
    try:
        if not os.path.exists(dockerfile):
            raise Exception("Dockerfile not found")
        if not os.path.exists(".dockerignore"):
            raise Exception("""

.dockerignore file not found. Burst requires a .dockerignore to avoid sending excess data to docker build.
Because the working directory is rsync'd to the remote host, you typically only need to send the Dockerfile
and files that are referred to (such as requirements.txt) to the build daemon.

#Template .dockerignore
#Default to ignoring everything:
**
#exceptions (These will be sent to the docker daemon for building):
!/Dockerfile*
!requirements.txt
""")

        if not os.path.exists(".burstignore"):
            raise Exception("""

.burstignore file not found. Burst requires a .burstignore to avoid synchronizing irrelevant data (such as
hidden files) with the remote server. Here is a template, copy this to .burstignore in your project directory:

.*
venv
__pycache__
""")

        #if url specified, split into user & IP
        if url:
            if not sshuser:
                sshuser, url = url.split('@')

        #launch, restart, or reconnect to node
        node = None

        #unless running --local:
        if url or uuid or burst_user:

            #if server does not exist, launch a fresh one
            fresh = False
            restart = False
            node = get_server(url=url, uuid=uuid, name=burst_user, conf=conf)
            if burst_user and not node:
                node = launch_server(burst_user,
                                     pubkey=pubkey,
                                     vmtype=vmtype,
                                     image=image,
                                     conf=conf,
                                     user=sshuser,
                                     gpu=gpu)
                fresh = True
                restart = True
            if node:

                #if stopped, restart
                if node.state.lower() != "running":
                    restart = True
                    vprint("Starting server")
                    node = start_server(node)

                #by now we must have a public IP address
                url = node.public_ips[0]

                #wait for ssh daemon to be ready
                vprint("Waiting for sshd")
                cmd = [
                    "ssh", "-o StrictHostKeyChecking=no",
                    "-o UserKnownHostsFile=/dev/null", "-o LogLevel=error",
                    "{0}@{1}".format(sshuser, url), "echo", "'sshd responding'"
                ]
                vvprint(cmd)
                good = False
                for z in range(10, -1, -1):
                    ret = run(cmd, timeout=15)
                    if ret[0].strip()[-15:] == 'sshd responding':
                        good = True
                        break
                    vprint(
                        "still waiting on sshd (this can take a while) -- will try %d more times"
                        % z)
                    if z:
                        time.sleep(5)
                if not good:
                    raise Exception("error in ssh call: %s" % ret[0].strip())
                vvprint("SSH returns -->%s|%s<--" % ret)
            else:
                raise Exception("Error: node not found")

        docker_port_args = ""

        #we have a url unless running --local:
        if url:

            #if just launched, install docker
            if fresh:
                vprint("Configuring Docker")
                # 'sudo apt-get -y update; sudo apt-get -y install docker.io; ' \ #images have docker installed
                cmd = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error {0}@{1} ' \
                      '"sudo usermod -a -G docker ubuntu; ' \
                      'sudo systemctl unmask docker; sudo service docker start"'.format(sshuser, url)
                vvprint(cmd)
                os.system(cmd)

            vprint("Connecting through ssh")
            tunnel, docker_port_args = ssh_tunnel(url, sshuser, ports,
                                                  dockerdport)

            #path = absolute working directory on host
            relpath = os.path.abspath('.')[len(os.path.expanduser('~')):]
            relpath = "/_BURST" + relpath.replace('/', '_')  #I can exlain
            locpath = os.path.abspath('.')
            path = "/home/{0}{1}".format(sshuser, relpath)

            # Bound unconditionally: the cloud-storage setup below reads
            # `remote` even when sync_only skips the docker checks.
            remote = "-H localhost:%s" % dockerdport

            if not sync_only:
                # part of check to see if docker is installed and running
                cmd = [
                    "docker", "{0}".format(remote), "ps", "--format",
                    '{{json .}}'
                ]
                vvprint(cmd)
                out, err = run(cmd)
                vvprint("PS returns:", out)
                running = len([x for x in out.strip().split("\n") if x])
                if running:
                    raise Exception(
                        "docker process already running -- burst does not support multiple processes"
                    )

                #prepare to build docker container
                vprint("Removing topmost layer")  #to avoid running stale image
                cmd = [
                    "docker", "{0}".format(remote), "rmi", "--no-prune",
                    DEFAULT_IMAGE
                ]
                vvprint(cmd)
                out, err = run(cmd)
                if "no such image" in out.lower():
                    out = "Creating new burst_image"
                vvprint(out)

            vmtype, image = fix_vmtype_and_image(vmtype, image)
            if vmtype and vmtype != get_server_vmtype(node):  #FIXME
                raise Exception(
                    "Cannot change vmtype (instance type) or gpu status -- need to re-launch"
                )

            # get_server_image is broken, so the host image is not compared
            # against the requested one here; need to prompt better.

            vprint("burst: name %s vmtype %s image %s url %s" %
                   (node.name, vmtype, image, url))

            #if using cloud storage (s3 etc), set up config & auth for rclone
            if cloudmap:
                if remote:
                    stor = get_config()['storage']
                    if stor['provider'] == 'GCS':
                        #create a keyfile & point to it
                        srvacctf = ".rclone_key_%s.json" % stor['settings'][
                            'private_key']['private_key_id']
                        with open(srvacctf, 'w') as keyfile:
                            json.dump(stor['settings']['private_key'], keyfile)
                        stor['settings']['service_account_file'] = srvacctf

                    # build  & save rclone.conf
                    s = f"[{stor['config']}]\n"
                    for k, v in stor.items():
                        if k != 'settings':
                            s += f"{k} = {v}\n"
                    for k, v in stor['settings'].items():
                        s += f"{k} = {v}\n"
                    with open(".rclone.conf", 'w') as rclone_conf:
                        rclone_conf.write(s)

            rsync_ignore_path = os.path.abspath("./.burstignore")
            if not sync_only:  #sync_only means from remote to local
                #sync local working data to host
                if not os.path.exists(rsync_ignore_path):
                    vprint("creating empty .burstignore")
                    os.system("touch .burstignore")
                cmd = 'rsync -rltzu{4} --del --include=.rclone.conf --exclude-from {5} -e "ssh -o StrictHostKeyChecking=no ' \
                      '-o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/. {3}@{1}:{2}/'.format(locpath,
                                            url, path, sshuser, get_rsync_v(), rsync_ignore_path)
                vprint("Synchronizing project folders")
                vvprint(cmd)
                os.system(cmd)

            if get_config().provider == 'GCE':
                # sync service acct creds (for shutdown)
                cmd = 'rsync -rltzu{4} --relative -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/./.burst/{5} {3}@{1}:{2}/'.format(
                    os.path.expanduser('~'), url, path, sshuser, get_rsync_v(),
                    get_config().raw_secret)
                vprint("Synchronizing credentials for shutdown")
                vvprint(cmd)
                os.system(cmd)

            #if fresh launch, clone burst locally for monitor
            if fresh:
                vprint("Installing burst on server")
                vvprint("Delay for apt-get to settle")
                time.sleep(30)  #trust me this helps
                vvprint("Delay done")
                err = do_ssh(f"{sshuser}@{url}",
                             '"%s"' % install_burst_sh)  #notable quoteables
                if err:
                    raise Exception("Failed to install burst on remote server")
            if restart:
                vprint("updating burst installation for monitor")
                err = do_ssh(f"{sshuser}@{url}", '"%s"' % update_burst_sh)
                if err:
                    raise Exception("Failed to update burst on remote server")
                vprint("Starting monitor process for shutdown++")
                #run monitor (in detached screen) to check if user's burst OR rsync is still running
                conf = get_config()
                if conf.provider == "GCE":
                    secret = ".burst/" + conf.raw_secret
                else:
                    secret = conf.secret

                proj = ('--project ' + conf.project) if conf.project else ''
                cmd = f"screen -md bash -c 'cd {path}; /usr/bin/python3 ~/burst/burst/monitor/monitor.py" \
                      f" --ip {url} --access {conf.access} --provider {conf.provider}" \
                      f" --secret={secret} --region {conf.region} {proj} >> ~/burst_monitor.log'"
                vvprint(cmd)
                err = do_ssh(f"{sshuser}@{url}", '"%s"' % cmd)
                if err:
                    raise Exception("Failed to initialize timeout monitor")

        else:
            vprint("burst: running locally")
            remote = ""
            path = os.path.abspath('.')

        if not sync_only:
            #actually build container -- for reals
            vprint("Building docker container")
            cmd = "docker {1} build . --file {2} -t {0} {3}".format(
                DEFAULT_IMAGE, remote, dockerfile, get_piper())
            vvprint(cmd)
            os.system(cmd)

            jupyter = False
            if len(args):
                jupyter = args[0] == 'jupyter'

            #build argument list -- re-quote if whitespace
            s = ""
            for a in args:
                a = a.strip()
                if " " in a:
                    if '"' in a:
                        s += f"'{a}' "
                    else:
                        s += f'"{a}" '
                else:
                    s += f"{a} "
            args = s.rstrip()

            if gpu:
                gpu_args = "--gpus all"
            else:
                gpu_args = ""

            #if mounting storage, add arguments & insert commands before (to mount) and after (to unmount) user-specified args
            cloud_args = ""
            if cloudmap:
                cloud, host = cloudmap.split(":")
                args = f"bash -c 'mkdir -p {host}; rclone mount --vfs-cache-mode writes --vfs-write-back 0 --config .rclone.conf {cloud}: {host} & sleep 3; {args}; umount {host}'"
                cloud_args = " --privileged"

            vprint("Running docker container")
            background_args = "-td" if bgd else "-ti"

            if jupyter:
                # `ports` defaults to None, so test truthiness rather than
                # len() to reach the helpful message instead of a TypeError.
                if not ports:
                    raise Exception("jupyter requires -p (usually 8888)")
                jupargs = f"--label ai.burstable.jupyter={ports[0]}"  #FIXME: document that 1st port is jupyter
            else:
                jupargs = ""

            cmd = f"docker {remote} run {gpu_args} {docker_port_args} --rm {background_args}" \
                  f" --label ai.burstable.shutdown={stop} {jupargs}" \
                  f" -v {path}:/home/burst/work {cloud_args} {DEFAULT_IMAGE} {args}"

            #run main task
            vvprint(cmd)
            vprint("")
            v0print("---------------------OUTPUT-----------------------")
            sys.stdout.flush()
            if bgd:
                cmd = cmd.split()
                docker_container, err = run(cmd)
                print("Running in background mode. Container =",
                      docker_container[:11])
            else:
                os.system(cmd)
            sys.stdout.flush()
            v0print("----------------------END-------------------------")
            sys.stdout.flush()

        #sync data on host back to local (note: we do not delete in this direction lest a fresh machine wipes our local workspace)
        if url and not bgd:
            vprint("Synchronizing folders")
            cmd = "rsync -rltzu{4} --exclude-from {5} -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error' '{3}@{1}:{2}/.' {0}/".format(
                locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
            vvprint(cmd)
            err = os.system(cmd + " " + get_piper())
            if err:
                vvprint("rsync returns:", err)
                vprint(
                    "Your session has timed out. Run 'burst sync' to synchronize data"
                )

    except Exception as ex:
        # Bit 64 of the verbosity mask enables a full traceback.
        if get_verbosity() & 64:
            v0print("--------------------------------")
            traceback.print_exc()
            v0print("--------------------------------")
        else:
            print()
        print(ex)
        error = "Exception"
    finally:
        # Tear the ssh tunnel down on every exit path, including
        # KeyboardInterrupt, so no orphaned ssh process is left behind.
        if tunnel:
            tunnel.kill()
    return error
예제 #6
0
파일: lcloud.py 프로젝트: danx0r/burst
def launch_server(name,
                  vmtype=None,
                  image=None,
                  pubkey=None,
                  conf=None,
                  user=None,
                  gpu=False):
    """Create a new cloud node and wait until it has a public IP address.

    Args:
        name: Name given to the new node.
        vmtype: Instance type name; resolved via ``fix_vmtype_and_image`` and
            then matched against ``config.driver.list_sizes()``.
        image: Image name; resolved via ``fix_vmtype_and_image`` and then
            looked up in a provider-specific way.
        pubkey: Public ssh key to install. If falsy, the node is created
            without any key or provider-specific options.
        conf: Configuration overrides passed to ``init``.
        user: Login name paired with ``pubkey`` in the GCE ``sshKeys``
            metadata entry.
        gpu: On GCE, attach one ``config.default_gpu`` accelerator.

    Returns:
        The node object, refreshed until ``public_ips`` holds an address.

    Raises:
        Exception: if the image or instance type is not found, if
            ``disksize`` is not configured, or if ``pubkey`` is given for a
            provider other than EC2 or GCE.
    """
    init(conf)
    vmtype, image = fix_vmtype_and_image(vmtype, image)
    image_full_path = image
    if config.provider == 'EC2':
        images = config.driver.list_images(ex_filters={'name': image})
    elif config.provider == 'GCE':
        #note: GCE libcloud driver list_images is hella borke, list is incomplete so...
        images = []
        for proj in ["deeplearning-platform-release", "ubuntu-os-cloud"]:
            try:
                im = config.driver.ex_get_image(image, ex_project_list=proj)
                images = [im]
                break
            except ResourceNotFoundError:
                # Not in this project; try the next one.
                pass
    else:
        ims = config.driver.list_images()
        images = [x for x in ims if x.name == image]
    if not images:
        raise Exception("Image %s not found" % image)
    image = images[0]

    vmtypes = [x for x in config.driver.list_sizes() if x.name == vmtype]
    if not vmtypes:
        raise Exception("Instance vmtype %s not found" % vmtype)
    vmtype = vmtypes[0]

    if 'disksize' not in config or config.disksize is None:
        raise Exception(
            "Need to add disksize to config or specify (in gigabytes, eg --disksize=150)"
        )

    vprint(
        "Launching instance image=%s, id=%s, session=%s, type=%s ram=%s disk=%s"
        % (image_full_path, image.id, name, vmtype.id, vmtype.ram,
           config.disksize))

    if pubkey:
        if config.provider == 'EC2':  #Everybody makes it up
            auth = NodeAuthSSHKey(pubkey)
            node = config.driver.create_node(
                name,
                vmtype,
                image,
                auth=auth,
                ex_blockdevicemappings=[  #So sue me
                    {
                        'Ebs.VolumeSize': config.disksize,
                        'DeviceName': '/dev/sda1'
                    }
                ])
        elif config.provider == 'GCE':
            meta = {
                'items': [{
                    'key': 'sshKeys',
                    'value': '%s: %s' % (user, pubkey)
                }]
            }
            # GPU nodes only differ by the accelerator-related options.
            extra = {}
            if gpu:
                vprint("Launching with GPU")
                extra = {
                    'ex_accelerator_type': config.default_gpu,
                    'ex_accelerator_count': 1,
                    'ex_on_host_maintenance': "TERMINATE",
                }
            else:
                vprint("Launching without GPU")
            node = config.driver.create_node(name,
                                             vmtype,
                                             image,
                                             ex_metadata=meta,
                                             **extra)
        else:
            raise Exception("Unsupported clown provider: %s" % config.provider)
    else:
        node = config.driver.create_node(name, vmtype, image)
    vprint("Waiting for public IP address to be active")
    config.driver.wait_until_running([node])
    # Same readiness rule as start_server: an empty list or a list holding
    # only None (e.g. [None]) means no address has been assigned yet.
    while all(ip is None for ip in node.public_ips):
        node = get_server(
            uuid=node.uuid)  #seems necessary to refresh to update state
        vprint("Public IPs:", node.public_ips)
        time.sleep(5)
    vprint("Public IPs:", node.public_ips)
    return node
예제 #7
0
파일: lcloud.py 프로젝트: danx0r/burst
def init(conf=None):
    """Populate the module-level ``config`` and create the cloud driver.

    Does nothing if ``config`` already has a ``driver`` entry. Otherwise the
    YAML config file is read (if it exists), the selected compute and storage
    configurations are flattened, and each known parameter is taken from
    ``conf`` if present there, else from the YAML data, else ``None``.

    Args:
        conf: Optional mapping of overrides. Recognized keys include
            ``configfile``, ``compute_config``, ``storage_config``,
            ``provider`` and the parameter names copied into ``config``.

    Raises:
        Exception: if no provider is given in ``conf`` or the config file.
    """
    #init is a one-time thang
    if 'driver' in config:
        return

    if conf is None:
        conf = {}

    # Resolve the default path lazily and via expanduser, so an unset HOME
    # cannot raise KeyError (least of all when a configfile was supplied).
    if 'configfile' in conf:
        yam = conf['configfile']
    else:
        yam = os.path.join(os.path.expanduser("~"), ".burst", "config.yml")

    if os.path.exists(yam):
        #FIXME: check for local overriding .burst
        # NOTE(review): FullLoader is not intended for untrusted input;
        # presumably this file is user-owned -- confirm, or use safe_load.
        with open(yam) as f:
            yconf = yaml.load(f, Loader=yaml.FullLoader)
        if 'compute_config' in conf:
            compute_config = conf['compute_config']
        else:
            compute_config = yconf['compute']['settings']['default_compute']
        storage_config = None
        if 'storage_config' in conf:  #if storage_config passed in, use
            storage_config = conf['storage_config']
        elif 'storage' in yconf:  #otherwise check in config.yml
            storage_config = yconf['storage']['settings']['default_storage']
        if storage_config:  #if it exists,
            storage = yconf['storage']['configurations'][
                storage_config]  #use it
            storage['config'] = storage_config  #and store the config name too
        yconf = yconf['compute']['configurations'][compute_config]
        yconf.update(yconf['settings']
                     )  #easier to deal with all attributes at top level
        yconf['compute_config'] = compute_config
        if storage_config:  #if specified,
            yconf['storage'] = storage  #pull storage to top level for ease

    else:
        vprint("config.yml not found")
        yconf = {}  #dummy yconf

    if 'provider' in conf:
        config.provider = conf['provider']
    elif 'provider' in yconf:
        config.provider = yconf['provider']
    else:
        raise Exception(
            "Configuration file %s not available. Try running:\nburst configure"
            % yam)

    for param in [
            'access', 'secret', 'region', 'project', 'default_image',
            'default_vmtype', 'default_gpu_image', 'default_gpu_vmtype',
            'default_gpu', 'storage', 'compute_config', 'disksize'
    ]:
        if param in conf:
            config[param] = conf[param]
        else:
            config[param] = yconf.get(param, None)

    if config.default_vmtype is None or config.default_gpu_vmtype is None:
        vprint("""config.yml syntax has changed:
rename default_size --> default_vmtype
default_gpu_size-->default_gpu_vmtype""")

    cls = get_driver(Provider[config.provider])

    if config.provider == 'EC2':
        config.driver = cls(config.access, config.secret, region=config.region)

    elif config.provider == 'GCE':
        if hasattr(config.secret, 'lower'):  #string points to key file
            privkeypath = config.secret
            config.raw_secret = config.secret
        else:  #if dict, create key file
            config.raw_secret = "%s.json" % config.secret['private_key_id']
            privkeypath = "%s/.burst/%s.json" % (
                os.path.expanduser("~"), config.secret['private_key_id'])
        # NOTE(review): this also runs when the secret is a string path that
        # does not exist, writing that string as JSON -- confirm intended.
        if not os.path.exists(privkeypath):
            with open(privkeypath, 'w') as fp:
                json.dump(config.secret, fp)
        config.driver = cls(config.access,
                            privkeypath,
                            datacenter=config.region,
                            project=config.project)
    else:
        # No driver is created here, so a later init() call will retry.
        vprint("ERROR: unknown cloud provider", config.provider)
예제 #8
0
파일: burst.py 프로젝트: lebailly/burst
def burst(args,
          sshuser=None,
          url=None,
          uuid=None,
          burst_user=None,
          gpus="",
          ports=None,
          stop=False,
          image=None,
          size=None,
          pubkey=None,
          dockerfile="Dockerfile",
          cloudmap="",
          dockerdport=2376,
          conf=None):
    tunnel = None
    try:
        if not os.path.exists(dockerfile):
            raise Exception("Dockerfile not found")
        if not os.path.exists(".dockerignore"):
            raise Exception("""

.dockerignore file not found. Burst requires a .dockerignore to avoid sending excess data to docker build.
Because the working directory is rsync'd to the remote host, you typically only need to send the Dockerfile
and files that are referred to (such as requirements.txt) to the build daemon.

#Template .dockerignore
#Default to ignoring everything:
**
#exceptions (These will be sent to the docker daemon for building):
!/Dockerfile*
!requirements.txt
""")

        #if url specified, split into user & IP
        if url:
            if not sshuser:
                sshuser, url = url.split('@')

        #launch, restart, or reconnect to node
        node = None

        #unless running --local:
        if url or uuid or burst_user:

            #if server does not exist, launch a fresh one
            fresh = False
            restart = False
            node = get_server(url=url, uuid=uuid, name=burst_user, conf=conf)
            if burst_user and not node:
                node = launch_server(burst_user,
                                     pubkey=pubkey,
                                     size=size,
                                     image=image,
                                     conf=conf,
                                     user=sshuser,
                                     gpus=gpus)
                fresh = True
                restart = True
            if node:

                #if stopped, restart
                if node.state.lower() != "running":
                    restart = True
                    vprint("Starting server")
                    node = start_server(node)

                #by now we must have a public IP address
                url = node.public_ips[0]

                #wait for ssh daemon to be ready
                vprint("Waiting for sshd")
                cmd = [
                    "ssh", "-o StrictHostKeyChecking=no",
                    "-o UserKnownHostsFile=/dev/null", "-o LogLevel=error",
                    "{0}@{1}".format(sshuser, url), "echo", "'sshd responding'"
                ]
                vvprint(cmd)
                good = False
                for z in range(10, -1, -1):
                    ret = run(cmd, timeout=15)
                    if ret[0].strip()[-15:] == 'sshd responding':
                        good = True
                        break
                    vprint("sshd not responding; %d attempts left" % z)
                    if z:
                        time.sleep(5)
                if not good:
                    raise Exception("error in ssh call: %s" % ret[0].strip())
                vvprint("SSH returns -->%s|%s<--" % ret)
            else:
                raise Exception("Error: node not found")

        #we have a url unless running --local:
        if url:

            #if just launched, install docker
            if fresh:
                print("Configuring Docker")
                # 'sudo apt-get -y update; sudo apt-get -y install docker.io; ' \ #images have docker installed
                cmd = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error {0}@{1} ' \
                      '"sudo usermod -a -G docker ubuntu; ' \
                      'sudo systemctl unmask docker; sudo service docker start"'.format(sshuser, url)
                vvprint(cmd)
                os.system(cmd)

            vprint("Connecting through ssh")

            #set up ssh tunnel mapping docker socket, ports
            host_port_args = []
            docker_port_args = ""
            if ports:
                for pa in ports:
                    if ':' in pa:
                        local_port, remote_port = pa.split(':')
                    else:
                        remote_port = local_port = pa
                    docker_port_args += " -p {0}:{0}".format(remote_port)
                    host_port_args.append("-L {0}:localhost:{1}".format(
                        local_port, remote_port))
            # print ("PORTS: |%s|%s|" % (docker_port_args, host_port_args)); exit()
            remote = "-H localhost:%s" % dockerdport
            ssh_args = [
                "ssh", "-o StrictHostKeyChecking=no",
                "-o UserKnownHostsFile=/dev/null", "-o LogLevel=error", "-NL",
                "{0}:/var/run/docker.sock".format(dockerdport),
                "{0}@{1}".format(sshuser, url)
            ]
            for arg in host_port_args:
                ssh_args.insert(3, arg)
            vvprint(ssh_args)
            tunnel = subprocess.Popen(ssh_args)
            time.sleep(2)

            #path = absolute working directory on host
            relpath = os.path.abspath('.')[len(os.path.expanduser('~')):]
            relpath = "/_BURST" + relpath.replace('/', '_')  #I can exlain
            locpath = os.path.abspath('.')
            path = "/home/{0}{1}".format(sshuser, relpath)

            #part of check to see if docker is installed and running
            cmd = [
                "docker", "{0}".format(remote), "ps", "--format", '{{json .}}'
            ]
            vvprint(cmd)
            out = run(cmd)
            vvprint("PS returns -->%s|%s<--" % out)
            monitor_running = False
            if out[1]:
                for line in out[0].split("\n"):
                    if not line:
                        continue
                    j = json.loads(line)
                    # pprint(j)
                    # print ("RUNNING:", j['Image'], j['Labels'])
                    for x in j['Labels'].split(','):
                        if 'ai.burstable.monitor=' == x:
                            monitor_running = True
            vprint("monitor_running: %s" % monitor_running)

            #if restarted (including fresh launch), start monitor docker process
            if restart or not monitor_running:
                #put sentinel script in working dir; gets rsync'd to host
                if not os.path.exists(".burst-sentinel.py"):
                    vvprint("creating .burst-sentinel.py in",
                            os.path.abspath('.'))
                    f = open(".burst-sentinel.py", 'w')
                    f.write(burst_sentinel_py)
                    f.close()

                vprint("Starting monitor process for shutdown++")
                #run monitor (in docker container) to check if user's burst OR rsync is still running
                conf = get_config()
                if conf.provider == "GCE":
                    secret = ".burst/" + conf.raw_secret
                else:
                    secret = conf.secret
                # print("SECRET 1:", secret)
                cmd = f"docker {remote} run --label 'ai.burstable.monitor' " \
                      f"--rm {get_dockrunflags()}  -v /var/run/docker.sock:/var/run/docker.sock" \
                      f" {MONITOR_IMAGE} burst-monitor" \
                      f" --ip {url} --access {conf.access} --provider {conf.provider} {get_piper()}" \
                      f" --secret={secret} --region {conf.region} {('--project ' + conf.project) if conf.project else ''}"
                vvprint(cmd)
                vvprint("Shutdown process container ID:")
                os.system(cmd)

            #prepare to build docker container
            vprint("Removing topmost layer")  #to avoid running stale image
            cmd = [
                "docker", "{0}".format(remote), "rmi", "--no-prune",
                DEFAULT_IMAGE
            ]
            vvprint(cmd)
            out, err = run(cmd)
            vvprint(out)
            size, image = fix_size_and_image(size, image)
            if size and size != get_server_size(node):  #FIXME
                raise Exception(
                    "Cannot change size (instance type) -- need to re-launch")

            # get_server_image is broken, so the host-image check below stays disabled; need to prompt better here
            # if image and image != get_server_image(node):
            #     raise Exception("FIXME: cannot change host image -- need to terminate & re-launch server")

            vprint("burst: name %s size %s image %s url %s" %
                   (node.name, size, image, url))

            #if using cloud storage (s3 etc), set up config & auth for rclone
            if cloudmap:
                if remote:
                    stor = get_config()['storage']
                    if stor['provider'] == 'GCS':
                        #create a keyfile & point to it
                        srvacctf = ".rclone_key_%s.json" % stor['settings'][
                            'private_key']['private_key_id']
                        f = open(srvacctf, 'w')
                        json.dump(stor['settings']['private_key'], f)
                        f.close()
                        stor['settings']['service_account_file'] = srvacctf

                    # build  & save rclone.conf
                    s = f"[{stor['config']}]\n"
                    for k, v in stor.items():
                        if k != 'settings':
                            s += f"{k} = {v}\n"
                    for k, v in stor['settings'].items():
                        s += f"{k} = {v}\n"
                    f = open(".rclone.conf", 'w')
                    f.write(s)
                    f.close()

            #sync local working data to host
            rsync_ignore_path = os.path.abspath("./.burstignore")
            if not os.path.exists(rsync_ignore_path):
                vprint("creating empty .burstignore")
                os.system("touch .burstignore")
            cmd = 'rsync -rltzu{4} --exclude-from {5} -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/. {3}@{1}:{2}/'.format(
                locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
            vprint("Synchronizing project folders")
            vvprint(cmd)
            os.system(cmd)

            # if get_config().provider == 'GCE':
            #     # sync service acct creds (for shutdown)
            #     cmd = 'rsync -rltzu{4} --relative -e "ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error" {0}/./.burst/{5} {3}@{1}:{2}/'.format(os.path.expanduser('~'),
            #                             url, path, sshuser, get_rsync_v(), get_config().raw_secret)
            #     vprint("Synchronizing credentials for shutdown")
            #     vvprint (cmd)
            #     os.system(cmd)

            if restart or not monitor_running:
                vprint("Starting host sentinel process for shutdown")

                #set up sentinel script in detached screen on host (not docker) to help check on rsync
                cmd = 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error {0}@{1} ' \
                      '"cd {2} ; screen -md python3 .burst-sentinel.py"'.format(sshuser, url, path)
                vvprint(cmd)
                os.system(cmd)

        else:
            vprint("burst: running locally")
            remote = ""
            path = os.path.abspath('.')

        #actually build container -- for reals
        vprint("Building docker container")
        cmd = "docker {1} build . --file {2} -t {0} {3}".format(
            DEFAULT_IMAGE, remote, dockerfile, get_piper())
        vvprint(cmd)
        os.system(cmd)

        args = " ".join(args)
        gpu_args = "--gpus " + gpus if gpus else ""

        #if mounting storage, add arguments & insert commands before (to mount) and after (to unmount) user-specified args
        cloud_args = ""
        if cloudmap:
            cloud, host = cloudmap.split(":")
            args = f"bash -c 'mkdir -p {host}; rclone mount --vfs-cache-mode writes --vfs-write-back 0 --config .rclone.conf {cloud}: {host} & sleep 3; {args}; umount {host}'"
            cloud_args = " --privileged"

        vprint("Running docker container")
        cmd = "docker {3} run {4} {5} --rm -ti --label ai.burstable.shutdown={7} -v {2}:/home/burst/work {6} {0} {1}".format(
            DEFAULT_IMAGE, args, path, remote, gpu_args, docker_port_args,
            cloud_args, stop)

        #run user-specified args
        vvprint(cmd)
        vprint("")
        v0print("---------------------OUTPUT-----------------------")
        sys.stdout.flush()
        os.system(cmd)
        sys.stdout.flush()
        v0print("----------------------END-------------------------")
        sys.stdout.flush()

        #sync data on host back to local
        if url:
            vprint("Synchronizing folders")
            cmd = "rsync -rltzu{4} --exclude-from {5} -e 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=error' '{3}@{1}:{2}/.' {0}/".format(
                locpath, url, path, sshuser, get_rsync_v(), rsync_ignore_path)
            vvprint(cmd)
            os.system(cmd)

    except Exception as ex:
        if get_verbosity() >= 256:
            v0print("--------------------------------")
            traceback.print_exc()
            v0print("--------------------------------")
        else:
            print()
        print(ex)

    # if url and node:
    #     # set up shutdown process
    #     if stop == 0:
    #         vprint ("Stopping VM at %s immediately as instructed" % url)
    #         stop_server(node)
    #     else:
    #         vprint ("Scheduling shutdown of VM at %s for %d seconds from now" % (url, stop))
    #         conf = get_config()
    #         if conf.provider == "GCE":
    #             secret = ".burst/" + conf.raw_secret
    #         else:
    #             secret = conf.secret
    #         # print("SECRET 1:", secret)
    #         cmd = f"docker {remote} run --rm {get_dockrunflags()} -v {path}:/home/burst/work {MONITOR_IMAGE} burst" \
    #               f" --verbosity {get_verbosity()} --stop_instance_by_url {url} --delay {stop} --access {conf.access}" \
    #               f" --secret={secret} --region {conf.region} {('--project ' + conf.project) if conf.project else ''}" \
    #               f" --provider {conf.provider} {get_piper()}"
    #         vvprint (cmd)
    #         vvprint ("Shutdown process container ID:")
    #         os.system(cmd)

    if tunnel:
        tunnel.kill()
예제 #9
0
파일: burst.py 프로젝트: lebailly/burst
        args_compute.provider = args.provider
    else:
        burst_conf = {}

        #command line overrides:
        if args.compute_config:
            burst_conf['compute_config'] = args.compute_config

        if args.storage_config:
            burst_conf['storage_config'] = args.storage_config

        if args.configfile:
            burst_conf['configfile'] = args.configfile

    if args.local and (args.uuid or args.url):
        vprint(args)
        parser.error(
            "when specifying --local, do not set --sshuser, --burst_user, --uuid, or --url"
        )
        exit()
    t0 = time.time()
    while time.time() - t0 < args.delay:
        vprint("%d seconds till action" % (args.delay + .5 + t0 - time.time()))
        time.sleep(5)

    #set default burst_user if necessary:
    if not (args.burst_user or args.uuid or args.url or args.local
            or args.version):
        burst_user = getpass.getuser()
        args.burst_user = "******" + burst_user
        vprint("Session: %s" % args.burst_user)