def cli() -> None:
    """
    Create and manage PyTorch OSS On-Demand machines. Machines are provisioned
    in AWS based on the most recent build of the 'viable/strict' branch of
    PyTorch.

    This tool provisions SSH keys so only you are able to log in and verifies
    that you are an active FB employee. A GitHub OAuth token is required to
    enable pushing from the on-demand's PyTorch repo.

    Note: On-demands are stopped every night at 3 AM PST. A stopped on-demand's
    data will still be there when it is re-started. Once an on-demand has not
    been started for 3 days, it will be permanently terminated (and the data
    will be lost).

    TODO: This is unimplemented
    """
    init()
    init_logger()

    def exception_handler(exc_type: Any, exc_value: Any, exc_traceback: Any) -> None:
        get_logger().error(
            "Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback)
        )
        sys.__excepthook__(exc_type, exc_value, exc_traceback)

    sys.excepthook = exception_handler

    log(f"Invoked with: {sys.argv}")
def stop(name: Optional[str], all: bool, id: Optional[str], action: str) -> None:
    """
    Delete an on-demand. Use '--action stop' to pause an on-demand, or leave
    this option off to permanently terminate an on-demand.
    """
    with yaspin.yaspin(text=TimedText("Gathering instances")) as spinner:
        ids_to_stop = []
        if all:
            user_instances = get_instances_for_user(username())
            log("Stopping all instances")
            for instance in user_instances:
                ids_to_stop.append(instance["InstanceId"])
        else:
            if name is None and id is None:
                to_stop = instance_for_id_or_name_or_guess(id, name)
                ids_to_stop.append(to_stop["InstanceId"])
            else:
                user_instances = get_instances_for_user(username())
                to_stop = instance_for_id_or_name(id, name, user_instances)
                if to_stop is None:
                    raise RuntimeError(f"Instance {name or id} not found")
                ids_to_stop.append(to_stop["InstanceId"])
        ok(spinner)

    log(f"Setting instances {ids_to_stop} to {action}")
    stop_instances(action, ids_to_stop)
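# Hypothetical usage sketch for `stop` (the CLI wiring is not shown in this
# file; flag spellings are assumed from the docstring and error messages above):
#
#   aws_od_cli stop --name my-ondemand --action stop   # pause; data is kept
#   aws_od_cli stop --name my-ondemand                  # terminate permanently
#   aws_od_cli stop --all                               # act on every instance you own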
def list(full: bool) -> None:
    """
    List all your current on-demands
    """
    log(f"Fetching full: {full}")
    rows = get_live_ondemands(full=full)
    log(f"{rows}")
    if len(rows) == 0:
        print("No on-demands found! Start one with 'aws_od_cli create'")
    else:
        print(
            tabulate.tabulate(
                [d.values() for d in rows], headers=[k for k in rows[0].keys()]
            )
        )
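# Hypothetical usage sketch for `list` (the --full flag name is an assumption
# based on the `full` parameter; it is not defined in this snippet):
#
#   aws_od_cli list           # tabulated summary of your on-demands
#   aws_od_cli list --full    # presumably includes the extra data from the full fetch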
def vscode(id: Optional[str], name: Optional[str], folder: Optional[str]) -> None:
    """
    Launch vscode for a remote

    If you only have a single on-demand the --id or --name flags aren't
    necessary. Also see 'aws_od_cli list'.
    """
    code_exe = locate_vscode()
    log(f"Found VSCode at {code_exe}")
    instance = instance_for_id_or_name_or_guess(id, name)
    name = instance["InstanceId"]

    if folder is None:
        folder = "/home/ubuntu/pytorch"
    elif not folder.startswith("/"):
        folder = "/home/ubuntu/" + folder

    run_cmd(
        [
            str(code_exe),
            "--folder-uri",
            f"vscode-remote://ssh-remote+{name}{folder}",
        ]
    )
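# Hypothetical usage sketch for `vscode` (flag names follow the docstring; a
# relative --folder is resolved under /home/ubuntu, as handled above):
#
#   aws_od_cli vscode                                     # guess the only on-demand, open ~/pytorch
#   aws_od_cli vscode --name my-ondemand --folder notes   # opens /home/ubuntu/notes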
def create(
    no_login: bool,
    no_files: bool,
    no_rm: bool,
    gpu: bool,
    user_ami: Optional[str],
    user_instance_type: Optional[str],
    volume_size: int,
) -> None:
    """
    Create a new on-demand

    TODO: this doesn't work when Packer is updating the AMI (since it goes into
    pending status), there should be a fallback AMI that's the old one
    """
    rm = not no_rm
    if no_login and rm:
        raise RuntimeError(
            "--rm can only be used when auto-ssh is enabled, so remove the --no-login flag"
        )

    instance_type = "c5a.4xlarge"
    if gpu and user_instance_type is not None:
        raise RuntimeError("Cannot use both --gpu and --instance-type")
    if gpu and user_ami is not None:
        raise RuntimeError("Cannot use both --gpu and --ami")
    if gpu:
        instance_type = "g4dn.8xlarge"
    if user_instance_type is not None:
        instance_type = user_instance_type
    log(f"Using instance_type {instance_type}")

    if user_ami is not None:
        ami = {"ImageId": user_ami}
    else:
        ami = find_ami(gpu=gpu)
    log(f"Using ami {ami}")

    key_path = find_or_create_ssh_key()
    log(f"Using key {key_path}")

    # TODO: corp net sec group
    security_group = find_security_group("ondemand_ssh_and_mosh")

    # Make the instance via boto3
    instances, name = create_instance(
        ami,
        key_path,
        instance_type,
        use_startup_script=not no_files,
        security_group=security_group,
        volume_size=volume_size,
    )
    instance = instances["Instances"][0]
    log(f"Made instance {instance}")

    # Get its DNS name and write it to an SSH config for later access
    instance = wait_for_ip_address(instance)
    write_ssh_configs(instance)

    # Spin until 'ssh <instance-id>' runs successfully
    instance = wait_for_ssh_access(instance)

    ssh_dest = instance["InstanceId"]
    log(f"Using SSH destination {ssh_dest}")

    if not no_files:
        recorded_files = copy_files(ssh_dest, load_files())
        save_files(recorded_files)

    if no_login:
        print(
            textwrap.dedent(
                f"""
                Instance created! Log in with:

                    aws_od_cli ssh --name {name}
                """
            )
        )
    else:
        ssh_impl(ssh_dest)
        sync_files_to_local(ssh_dest, load_files())
        if rm:
            was_stopped = ask_to_stop_instance(instance)
            if not was_stopped:
                print(
                    "Manual actions:\n"
                    f"    SSH: aws_od_cli ssh --name {name}\n"
                    f"    Remove: aws_od_cli stop --name {name}\n"
                )
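# Hypothetical usage sketch for `create` (flag names follow the error messages
# and prints above; --gpu cannot be combined with --instance-type or --ami):
#
#   aws_od_cli create                                # c5a.4xlarge, auto-ssh when ready
#   aws_od_cli create --gpu                          # g4dn.8xlarge with the GPU AMI
#   aws_od_cli create --instance-type c5a.8xlarge    # override the default instance type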