def _block_until_ssh_ready(host: str) -> None:
    """Block until TCP port 22 on *host* accepts a connection.

    Polls once per second for up to ``CONFIG.ssh_timeout`` attempts.

    Raises:
        RuntimeError: if the port never becomes reachable within the
            timeout (likely a security-group / key / profile problem).
    """
    nprint_header(f"Waiting for instance to be ready for ssh at {host}. "
                  "This can take up to 2 minutes... ")
    start = time.monotonic()
    for _ in range(CONFIG.ssh_timeout):
        # Use a fresh socket per attempt and close it deterministically:
        # the original leaked the socket, and reusing one socket after a
        # failed connect_ex() leaves it in an undefined state on some
        # platforms.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(1)
            if sock.connect_ex((host, 22)) == 0:
                break
        time.sleep(1)
    else:
        # for/else: only reached when every attempt failed.
        raise RuntimeError(
            "Something went wrong while connecting to the instance.\n"
            "Please verify your security groups, instance key and "
            "instance profile, and try again.\n"
            "More info at docs.nimbo.sh/common-issues#cant-ssh.\n")
    nprint_header("Ready. (%0.3f s)" % (time.monotonic() - start))
def delete_instance(instance_id: str, dry_run=False) -> None:
    """Terminate a single EC2 instance and report its resulting state.

    A dry run deliberately raises ``DryRunOperation`` on the AWS side;
    that error is swallowed, while any other client error propagates.
    """
    ec2 = CONFIG.get_session().client("ec2")
    try:
        result = ec2.terminate_instances(InstanceIds=[instance_id],
                                         DryRun=dry_run)
        state = result["TerminatingInstances"][0]["CurrentState"]["Name"]
        nprint_header(f"Instance [green]{instance_id}[/green]: {state}")
    except botocore.exceptions.ClientError as e:
        # DryRunOperation signals "the request would have succeeded".
        if "DryRunOperation" not in str(e):
            raise
def delete_all_instances(dry_run=False) -> None:
    """Terminate every running instance matching Nimbo's instance filters.

    Args:
        dry_run: when True, AWS validates the requests without acting;
            the resulting ``DryRunOperation`` error is swallowed.
    """
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.describe_instances(
            Filters=[{
                "Name": "instance-state-name",
                "Values": ["running"],
            }] + AwsInstance._make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for inst in reservation["Instances"]:
                instance_id = inst["InstanceId"]
                # Fix: propagate DryRun to the terminate call as well.
                # Previously it was omitted; the loop was only saved from
                # real terminations because describe_instances raised
                # DryRunOperation first — a latent destructive bug.
                delete_response = ec2.terminate_instances(
                    InstanceIds=[instance_id],
                    DryRun=dry_run,
                )
                status = delete_response["TerminatingInstances"][0][
                    "CurrentState"]["Name"]
                nprint_header(
                    f"Instance [green]{instance_id}[/green]: {status}")
    except botocore.exceptions.ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
def setup(profile: str, full_s3_access=False) -> None:
    """Provision the Nimbo IAM user group, policies and (optionally) the
    S3-access instance role for the given AWS profile.

    Args:
        profile: name of the local AWS profile to use for the session.
        full_s3_access: when True, additionally create an instance role
            with AmazonS3FullAccess and let group members pass it.
    """
    session = boto3.Session(profile_name=profile)
    account = session.client("sts").get_caller_identity()["Account"]
    iam = session.client("iam")

    nprint_header(f"Creating user group {NIMBO_USER_GROUP}...")
    AwsPermissions._create_group(iam, NIMBO_USER_GROUP)

    nprint_header(f"Creating policy {EC2_POLICY_NAME}...")
    AwsPermissions._create_policy(iam, EC2_POLICY_NAME, EC2_POLICY_JSON)

    nprint_header(
        f"Attaching policy {EC2_POLICY_NAME} to user group {NIMBO_USER_GROUP}..."
    )
    iam.attach_group_policy(
        GroupName=NIMBO_USER_GROUP,
        PolicyArn=f"arn:aws:iam::{account}:policy/{EC2_POLICY_NAME}",
    )

    if full_s3_access:
        nprint_header(f"Creating role {S3_ACCESS_ROLE_NAME}...")
        AwsPermissions._create_role_and_instance_profile(
            iam, S3_ACCESS_ROLE_NAME)

        nprint_header(
            f"Attaching AmazonS3FullAccess policy to role {S3_ACCESS_ROLE_NAME}..."
        )
        iam.attach_role_policy(
            PolicyArn="arn:aws:iam::aws:policy/AmazonS3FullAccess",
            RoleName=S3_ACCESS_ROLE_NAME,
        )

        nprint_header(f"Creating policy {PASS_ROLE_POLICY_NAME}...")
        # Lets group members hand the S3 role to instances they launch.
        pass_role_policy_json = {
            "Version": "2012-10-17",
            "Statement": [{
                "Sid": "NimboPassRolePolicy",
                "Effect": "Allow",
                "Action": "iam:PassRole",
                "Resource": f"arn:aws:iam::*:role/{S3_ACCESS_ROLE_NAME}",
            }],
        }
        AwsPermissions._create_policy(iam, PASS_ROLE_POLICY_NAME,
                                      pass_role_policy_json)

        nprint_header(f"Attaching policy {PASS_ROLE_POLICY_NAME}"
                      f" to user group {NIMBO_USER_GROUP}...")
        iam.attach_group_policy(
            GroupName=NIMBO_USER_GROUP,
            PolicyArn=f"arn:aws:iam::{account}:policy/{PASS_ROLE_POLICY_NAME}",
        )

        nprint_header(f"Attaching policy AmazonS3FullAccess"
                      f" to user group {NIMBO_USER_GROUP}...")
        iam.attach_group_policy(
            GroupName=NIMBO_USER_GROUP,
            PolicyArn="arn:aws:iam::aws:policy/AmazonS3FullAccess",
        )
    else:
        nprint(
            "\nSince you chose not to give full S3 access to the Nimbo user group"
            " and instance role,\nwe recommend that you create a role with the"
            " necessary S3 permissions in the AWS console.\nOnce you do this, give"
            " the role name to the people using Nimbo so that they can set\n"
            "the 'role' field in the nimbo-config.yml to this value.",
            style="warning",
        )

    print()
    nprint_header("Done.")
    nprint_header("To add users to the NimboUserGroup, simply"
                  " run 'nimbo add-user USERNAME YOUR_AWS_PROFILE'.\n"
                  "For more info use 'nimbo add-user --help'")
def _start_instance() -> str:
    """Launch an EC2 instance (spot or on-demand per CONFIG) and return
    its instance id.

    Spot path: submits a spot request, polls until it is fulfilled (or
    fails / is interrupted), then tags the resulting instance.  Ctrl-C
    while waiting cancels the spot request and exits.
    """
    AwsPermissions.allow_ingress_current_ip(CONFIG.security_group)
    ec2 = CONFIG.get_session().client("ec2")

    tags = AwsInstance._make_instance_tags()
    filters = AwsInstance._make_instance_filters()
    ami = AwsInstance._get_image_id()
    nprint_header(f"Launching instance with image {ami}... ")

    ebs = {"VolumeSize": CONFIG.disk_size, "VolumeType": CONFIG.disk_type}
    if CONFIG.disk_iops:
        ebs["Iops"] = CONFIG.disk_iops

    launch_spec = {
        "BlockDeviceMappings": [{"DeviceName": "/dev/sda1", "Ebs": ebs}],
        "ImageId": ami,
        "InstanceType": CONFIG.instance_type,
        "KeyName": Path(CONFIG.instance_key).stem,
        "Placement": {"Tenancy": "default"},
        "SecurityGroups": [CONFIG.security_group],
        "IamInstanceProfile": {"Name": CONFIG.role},
    }

    if CONFIG.spot:
        spot_kwargs = {}
        if CONFIG.spot_duration:
            spot_kwargs["BlockDurationMinutes"] = CONFIG.spot_duration
        reply = ec2.request_spot_instances(
            LaunchSpecification=launch_spec,
            TagSpecifications=[{
                "ResourceType": "spot-instances-request",
                "Tags": tags,
            }],
            **spot_kwargs,
        )
        request = reply["SpotInstanceRequests"][0]
        request_id = request["SpotInstanceRequestId"]

        try:
            nprint_header("Spot instance request submitted.")
            nprint_header(
                "Waiting for the spot instance request to be fulfilled... ")
            while True:
                time.sleep(2)
                reply = ec2.describe_spot_instance_requests(
                    SpotInstanceRequestIds=[request_id],
                    Filters=filters,
                )
                request = reply["SpotInstanceRequests"][0]
                status = request["Status"]["Code"]
                if status == "fulfilled":
                    break
                # Anything other than a pending state means the request
                # failed (capacity, price, bad config, ...).
                if status not in ("pending-evaluation", "pending-fulfillment"):
                    raise Exception(reply["SpotInstanceRequests"][0]["Status"])
        except KeyboardInterrupt:
            ec2.cancel_spot_instance_requests(
                SpotInstanceRequestIds=[request_id])
            nprint_header("Cancelled spot instance request.")
            sys.exit(1)

        nprint_header("Done.")
        # Spot requests only tag the request; tag the instance explicitly.
        ec2.create_tags(
            Resources=[request["InstanceId"]],
            Tags=tags,
        )
        launched = request
    else:
        launch_spec["MinCount"] = 1
        launch_spec["MaxCount"] = 1
        launch_spec["InstanceInitiatedShutdownBehavior"] = "terminate"
        launch_spec["TagSpecifications"] = [{
            "ResourceType": "instance",
            "Tags": tags,
        }]
        launched = ec2.run_instances(**launch_spec)["Instances"][0]

    return launched["InstanceId"]
def run(job_cmd: str, dry_run=False) -> Dict[str, str]:
    """Launch an instance, sync environment/config/code to it, and run
    *job_cmd* remotely via the remote_setup script.

    Args:
        job_cmd: the user command to run, or the sentinels
            "_nimbo_launch" (just launch) / "_nimbo_notebook" (launch a
            notebook and forward port 57467).
        dry_run: when True, skip everything and return a dry-run message.

    Returns:
        dict with "message" (job_cmd + status suffix) and, except for
        dry runs, "instance_id".
    """
    import shutil  # local import: keeps the visible file self-contained

    if dry_run:
        return {"message": job_cmd + "_dry_run"}

    # Launch instance with new volume for anaconda
    telemetry.record_event("run")
    start_t = time.monotonic()
    instance_id = AwsInstance._start_instance()

    try:
        # Wait for the instance to be running
        AwsInstance._block_until_instance_running(instance_id)
        end_t = time.monotonic()
        nprint_header(f"Instance running. ({round((end_t - start_t), 2)} s)")
        nprint_header(f"InstanceId: [green]{instance_id}[/green]")
        print()

        time.sleep(5)
        host = AwsInstance._get_host_from_instance_id(instance_id)
        AwsInstance._block_until_ssh_ready(host)

        if job_cmd == "_nimbo_launch":
            nprint_header(
                f"Run [cyan]nimbo ssh {instance_id}[/cyan] to log onto the instance"
            )
            return {
                "message": job_cmd + "_success",
                "instance_id": instance_id
            }

        ssh = (f"ssh -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"
               " -o ServerAliveInterval=5 ")
        scp = f"scp -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"

        local_env = "/tmp/local_env.yml"
        # Fix: use shutil.copy instead of shelling out to `cp`
        # (resolves the long-standing TODO; avoids shell=True entirely).
        shutil.copy(CONFIG.conda_env, local_env)

        # Send conda env yaml and setup scripts to instance
        print()
        nprint_header("Syncing conda, config, and setup files...")
        AwsInstance._write_nimbo_vars()

        # Create project folder and send env and config files there
        subprocess.check_output(f"{ssh} ubuntu@{host} mkdir project",
                                shell=True)
        subprocess.check_output(
            f"{scp} {local_env} {CONFIG.nimbo_config_file} {NIMBO_VARS}"
            f" ubuntu@{host}:/home/ubuntu/project/",
            shell=True,
        )

        # Sync code with instance
        print()
        nprint_header("Syncing code...")
        AwsInstance._sync_code(host)
        nprint_header("Running setup code on the instance from here on.")

        # Run remote_setup script on instance
        AwsInstance._run_remote_script(ssh, scp, host, instance_id, job_cmd,
                                       "remote_setup.sh")

        if job_cmd == "_nimbo_notebook":
            # Forward the remote Jupyter port to the local machine.
            subprocess.Popen(
                f"{ssh} -o 'ExitOnForwardFailure yes' "
                f"ubuntu@{host} -NfL 57467:localhost:57467 >/dev/null 2>&1",
                shell=True,
            ).communicate()
            nprint_header(
                "Make sure to run 'nimbo sync-notebooks <instance_id>' frequently "
                "to sync the notebook to your local folder, as the remote notebooks"
                " will be lost once the instance is terminated.")

        return {
            "message": job_cmd + "_success",
            "instance_id": instance_id
        }
    except BaseException as e:
        # Fix: isinstance instead of `type(e) !=` comparisons.
        # Ctrl-C and failed subprocesses already print their own output.
        if not isinstance(e, (KeyboardInterrupt, subprocess.CalledProcessError)):
            nprint(e, style="error")
        if not CONFIG.persist:
            nprint_header(f"Deleting instance {instance_id} (from local)... ")
            AwsInstance.delete_instance(instance_id)
        return {"message": job_cmd + "_error", "instance_id": instance_id}
def run_access_test(dry_run=False) -> None:
    """End-to-end permissions check: S3 read/write from this machine,
    instance creation/deletion, ssh access, and S3 access from the
    instance (via remote_s3_test.sh).

    Mutates CONFIG (instance_type/run_in_background/persist) for the
    duration of the test and exits the process on failure.
    """
    if dry_run:
        return

    CONFIG.instance_type = "t3.medium"
    CONFIG.run_in_background = False
    CONFIG.persist = False

    try:
        # Send test file to s3 results path and delete it
        profile = CONFIG.aws_profile
        region = CONFIG.region_name
        results_path = CONFIG.s3_results_path

        subprocess.check_output(
            "echo 'Hello World' > nimbo-access-test.txt", shell=True)
        command = AwsStorage.mk_s3_command("cp", "nimbo-access-test.txt",
                                           results_path)
        subprocess.check_output(command, shell=True)
        command = f"aws s3 ls {results_path} --profile {profile} --region {region}"
        subprocess.check_output(command, shell=True)
        command = (f"aws s3 rm {results_path}/nimbo-access-test.txt "
                   f"--profile {profile} --region {region}")
        subprocess.check_output(command, shell=True)

        print("You have the necessary S3 read/write "
              "permissions from your computer \u2713")
    except subprocess.CalledProcessError as e:
        nprint(e, style="error")
        sys.exit(1)

    # Launch instance with new volume for anaconda
    print("Launching test instance... ")
    instance_id = AwsInstance._start_instance()

    try:
        # Wait for the instance to be running
        AwsInstance._block_until_instance_running(instance_id)
        print("Instance running. Instance creation allowed \u2713")
        print(f"InstanceId: {instance_id}")
        print()

        print("Trying to delete this instance...")
        AwsInstance.delete_instance(instance_id)
        print("Instance deletion allowed \u2713")

        print("\nLaunching another instance...")
        instance_id = AwsInstance._start_instance()
        print(f"Instance running. InstanceId: {instance_id}")

        time.sleep(5)
        host = AwsInstance._get_host_from_instance_id(instance_id)
        ssh = (f"ssh -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no' "
               "-o ServerAliveInterval=20")
        scp = f"scp -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"

        AwsInstance._block_until_ssh_ready(host)
        print("Instance key allows ssh access to remote instance \u2713")
        print("Security group allows ssh access to remote instance \u2713")

        AwsInstance._write_nimbo_vars()
        subprocess.check_output(
            f"{scp} {CONFIG.nimbo_config_file} {NIMBO_VARS} " +
            f"ubuntu@{host}:/home/ubuntu/",
            shell=True,
        )
        AwsInstance._run_remote_script(ssh, scp, host, instance_id, "",
                                       "remote_s3_test.sh")
    except BaseException as e:
        # Fix: isinstance instead of `type(e) !=` comparisons.
        if not isinstance(e, (KeyboardInterrupt, subprocess.CalledProcessError)):
            nprint(e, style="error")
        if not CONFIG.persist:
            nprint_header(f"Deleting instance {instance_id} (from local)...")
            AwsInstance.delete_instance(instance_id)
        sys.exit(1)