Example #1
def cli() -> None:
    """
    Create and manage PyTorch OSS On-Demand machines. Machines are provisioned in
    AWS based on the most recent build of the 'viable/strict' branch of PyTorch.

    This tool provisions SSH keys so only you are able to log in and verifies
    that you are an active FB employee. A GitHub OAuth token is required to
    enable pushing from the on-demand's PyTorch repo.

    Note: On-demands are stopped every night at 3 AM PST. A stopped on-demand's
    data will still be there when it is restarted. Once an on-demand has not
    been started for 3 days, it will be permanently terminated (and its data
    will be lost). TODO: This is unimplemented
    """
    init()
    init_logger()

    def exception_handler(exc_type: Any, exc_value: Any,
                          exc_traceback: Any) -> None:
        get_logger().error("Uncaught exception",
                           exc_info=(exc_type, exc_value, exc_traceback))
        sys.__excepthook__(exc_type, exc_value, exc_traceback)

    sys.excepthook = exception_handler
    log(f"Invoked with: {sys.argv}")
Example #2
def stop(name: Optional[str], all: bool, id: Optional[str],
         action: str) -> None:
    """
    Delete an on-demand. Use '--action stop' to pause it instead, or leave the
    option off to permanently terminate it.
    """
    with yaspin.yaspin(text=TimedText("Gathering instances")) as spinner:
        ids_to_stop = []
        if all:
            user_instances = get_instances_for_user(username())
            log("Stopping all instances")
            for instance in user_instances:
                ids_to_stop.append(instance["InstanceId"])
        else:
            if name is None and id is None:
                to_stop = instance_for_id_or_name_or_guess(id, name)
                ids_to_stop.append(to_stop["InstanceId"])
            else:
                user_instances = get_instances_for_user(username())
                to_stop = instance_for_id_or_name(id, name, user_instances)

                if to_stop is None:
                    raise RuntimeError(f"Instance {name} not found")

                ids_to_stop.append(to_stop["InstanceId"])

        ok(spinner)

    log(f"Setting instances {ids_to_stop} to {action}")
    stop_instances(action, ids_to_stop)
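
# A minimal sketch, not the project's code: stop_instances(action, ids) presumably
# dispatches the '--action' value onto the matching boto3 EC2 call. Both client
# methods used below exist in boto3 with these signatures; everything else is an
# assumption for illustration.
from typing import List

import boto3


def stop_instances_sketch(action: str, instance_ids: List[str]) -> None:
    ec2 = boto3.client("ec2")
    if action == "stop":
        # Pause the instances; their EBS volumes (and data) are kept.
        ec2.stop_instances(InstanceIds=instance_ids)
    else:
        # Permanently terminate the instances; their data is lost.
        ec2.terminate_instances(InstanceIds=instance_ids)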
Example #3
def list(full: bool) -> None:
    """
    List all your current on-demands
    """
    log(f"Fetching full: {full}")
    rows = get_live_ondemands(full=full)
    log(f"{rows}")

    if len(rows) == 0:
        print("No on-demands found! Start one with 'aws_od_cli create'")
    else:
        print(
            tabulate.tabulate([d.values() for d in rows],
                              headers=[k for k in rows[0].keys()]))
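
# A minimal sketch of how the table above is rendered. The row keys are made up for
# illustration; get_live_ondemands() appears to return a list of dicts, one per
# instance. Note that the command is itself named 'list', which shadows the builtin
# at module level, which is presumably why the original builds headers with a
# comprehension rather than list().
import tabulate

example_rows = [
    {"Name": "od-abc123", "InstanceId": "i-0123456789abcdef0", "State": "running"},
]
print(
    tabulate.tabulate([d.values() for d in example_rows],
                      headers=[k for k in example_rows[0].keys()]))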
Example #4
def vscode(id: Optional[str], name: Optional[str],
           folder: Optional[str]) -> None:
    """
    Launch VS Code for a remote on-demand

    If you only have a single on-demand, the --id or --name flags aren't necessary.
    Also see 'aws_od_cli list'.
    """
    code_exe = locate_vscode()
    log(f"Found VSCode at {code_exe}")
    instance = instance_for_id_or_name_or_guess(id, name)
    ssh_dest = instance["InstanceId"]
    if folder is None:
        folder = "/home/ubuntu/pytorch"
    elif not folder.startswith("/"):
        folder = "/home/ubuntu/" + folder

    run_cmd([
        str(code_exe), "--folder-uri",
        f"vscode-remote://ssh-remote+{ssh_dest}{folder}"
    ])
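
# A minimal sketch, an assumption rather than the project's implementation:
# locate_vscode() likely resolves the 'code' executable on PATH. shutil.which() is
# the stdlib way to do that; the macOS fallback path is purely illustrative.
import shutil
from pathlib import Path
from typing import Optional


def locate_vscode_sketch() -> Optional[Path]:
    code = shutil.which("code")
    if code is not None:
        return Path(code)
    # Common install location on macOS; a guess, not a guarantee.
    fallback = Path(
        "/Applications/Visual Studio Code.app/Contents/Resources/app/bin/code")
    return fallback if fallback.exists() else None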
Example #5
def create(
    no_login: bool,
    no_files: bool,
    no_rm: bool,
    gpu: bool,
    user_ami: Optional[str],
    user_instance_type: Optional[str],
    volume_size: int,
) -> None:
    """
    Create a new on-demand

    TODO: This doesn't work when Packer is updating the AMI (since it goes into
    pending status); there should be a fallback to the previous AMI.
    """
    rm = not no_rm
    if no_login and rm:
        raise RuntimeError(
            "Automatic instance removal requires auto-ssh, so remove the "
            "--no-login flag (or pass --no-rm)"
        )
    instance_type = "c5a.4xlarge"
    if gpu and user_instance_type is not None:
        raise RuntimeError("Cannot use both --gpu and --instance-type")
    if gpu and user_ami is not None:
        raise RuntimeError("Cannot use both --gpu and --ami")

    if gpu:
        instance_type = "g4dn.8xlarge"
    if user_instance_type is not None:
        instance_type = user_instance_type

    log(f"Using instance_type {instance_type}")

    if user_ami is not None:
        ami = {"ImageId": user_ami}
    else:
        ami = find_ami(gpu=gpu)

    log(f"Using ami {ami}")

    key_path = find_or_create_ssh_key()
    log(f"Using key {key_path}")

    # TODO: corp net sec group
    security_group = find_security_group("ondemand_ssh_and_mosh")

    # Make the instance via boto3
    instances, name = create_instance(
        ami,
        key_path,
        instance_type,
        use_startup_script=not no_files,
        security_group=security_group,
        volume_size=volume_size,
    )
    instance = instances["Instances"][0]

    log(f"Made instance {instance}")

    # Get its DNS name and write it to an SSH config for later access
    instance = wait_for_ip_address(instance)
    write_ssh_configs(instance)

    # Spin until ssh <instance-id> runs successfully
    instance = wait_for_ssh_access(instance)
    ssh_dest = instance["InstanceId"]

    log(f"Using SSH destination {ssh_dest}")

    if not no_files:
        recorded_files = copy_files(ssh_dest, load_files())
        save_files(recorded_files)

    if no_login:
        print(
            textwrap.dedent(f"""
            Instance created! Log in with:

                aws_od_cli ssh --name {name}
        """))
    else:
        ssh_impl(ssh_dest)

        sync_files_to_local(ssh_dest, load_files())
        if rm:
            was_stopped = ask_to_stop_instance(instance)
            if not was_stopped:
                print("Manual actions:\n"
                      f"    SSH: aws_od_cli ssh --name {name}\n"
                      f" Remove: aws_od_cli stop --name {name}\n")