def create_instance_profile_and_role(dry_run=False):
    """Create the 'NimboS3AndEC2FullAccess' role and 'NimboInstanceProfile'.

    Attaches the AWS-managed S3 and EC2 full-access policies to the role,
    creates the instance profile, and adds the role to it. No-op on dry_run.
    """
    iam_client = CONFIG.get_session().client("iam")
    role_name = "NimboS3AndEC2FullAccess"
    profile_name = "NimboInstanceProfile"

    # Trust policy: allow EC2 instances to assume this role.
    trust_policy = {
        "Version": "2012-10-17",
        "Statement": {
            "Effect": "Allow",
            "Action": "sts:AssumeRole",
            "Principal": {"Service": "ec2.amazonaws.com"},
        },
    }

    if dry_run:
        return

    iam_client.create_role(
        RoleName=role_name, AssumeRolePolicyDocument=json.dumps(trust_policy)
    )
    for policy_arn in (
        "arn:aws:iam::aws:policy/AmazonS3FullAccess",
        "arn:aws:iam::aws:policy/AmazonEC2FullAccess",
    ):
        iam_client.attach_role_policy(PolicyArn=policy_arn, RoleName=role_name)

    iam_client.create_instance_profile(InstanceProfileName=profile_name, Path="/")
    iam_client.add_role_to_instance_profile(
        InstanceProfileName=profile_name, RoleName=role_name
    )
def verify_nimbo_instance_profile(dry_run=False):
    """Verify that the 'NimboInstanceProfile' IAM instance profile exists.

    :param dry_run: skip the check entirely when True
    :raises Exception: with setup instructions, when the profile is missing
    """
    iam = CONFIG.get_session().client("iam")
    if dry_run:
        return

    response = iam.list_instance_profiles()
    instance_profile_names = [
        profile["InstanceProfileName"] for profile in response["InstanceProfiles"]
    ]

    # Bug fix: the profile created by create_instance_profile_and_role is named
    # 'NimboInstanceProfile' (singular); the old check looked for
    # 'NimboInstanceProfiles' and therefore could never succeed.
    if "NimboInstanceProfile" not in instance_profile_names:
        raise Exception(
            textwrap.dedent(
                """Instance profile 'NimboInstanceProfile' not found.
                An instance profile is necessary to give your instance
                access to EC2 and S3 resources. You can create an instance
                profile using 'nimbo create_instance_profile <role_name>'.
                If you are a root user, you can simply run
                'nimbo create_instance_profile_and_role', and nimbo will
                create the necessary role policies and instance profile for you.
                Otherwise, please ask your admin for a role that provides
                the necessary EC2 and S3 read/write access.
                For more details please go to docs.nimbo.sh/instance-profiles."""
            )
        )
def list_buckets():
    """Print the name of every S3 bucket owned by the configured account."""
    client = CONFIG.get_session().client("s3")
    buckets = client.list_buckets()["Buckets"]
    print("Existing buckets:")
    for entry in buckets:
        print(f'  {entry["Name"]}')
def list_spot_gpu_prices(dry_run=False):
    """Print a table of current spot prices for GPU instance types.

    Covers the p2/p3/p4 and g4d families available in the configured region.
    """
    if dry_run:
        return

    # Keep only the GPU families we care about, in sorted order.
    gpu_prefixes = ("p2", "p3", "p4", "g4d")
    gpu_instance_types = [
        name for name in sorted(ec2_instance_types()) if name.startswith(gpu_prefixes)
    ]

    ec2 = CONFIG.get_session().client("ec2")
    header = format_price_string(
        "InstanceType", "Price ($/hour)", "GPUs", "CPUs", "Mem (Gb)"
    )
    print(header)

    for name in gpu_instance_types:
        history = ec2.describe_spot_price_history(
            InstanceTypes=[name],
            Filters=[{"Name": "product-description", "Values": ["Linux/UNIX"]}],
        )
        # Most recent price is first in the history list.
        spot_price = float(history["SpotPriceHistory"][0]["SpotPrice"])
        num_gpus, gpu_type, mem, cpus = INSTANCE_GPU_MAP[name]
        print(
            format_price_string(
                name, round(spot_price, 2), f"{num_gpus} x {gpu_type}", cpus, mem
            )
        )
def ls_spot_gpu_prices(dry_run=False) -> None:
    """Print a styled table of current spot prices for relevant GPU instances."""
    if dry_run:
        return

    ec2 = CONFIG.get_session().client("ec2")
    header = AwsUtils._format_price_string(
        "InstanceType", "Price ($/hour)", "GPUs", "CPUs", "Mem (Gb)"
    )
    print()
    nprint(header, style="bold")

    for name in AwsUtils._instance_types():
        history = ec2.describe_spot_price_history(
            InstanceTypes=[name],
            Filters=[{"Name": "product-description", "Values": ["Linux/UNIX"]}],
        )
        # Most recent price is first in the history list.
        spot_price = float(history["SpotPriceHistory"][0]["SpotPrice"])
        num_gpus, gpu_type, mem, cpus = INSTANCE_GPU_MAP[name]
        print(
            AwsUtils._format_price_string(
                name, round(spot_price, 2), f"{num_gpus} x {gpu_type}", cpus, mem
            )
        )
    print()
def list_instance_profiles(dry_run=False):
    """Pretty-print every IAM instance profile in the account."""
    client = CONFIG.get_session().client("iam")
    if dry_run:
        return
    profiles = client.list_instance_profiles()["InstanceProfiles"]
    pprint(profiles)
def stop_instance(instance_id, dry_run=False):
    """Stop (not terminate) the given EC2 instance and print the API response."""
    client = CONFIG.get_session().client("ec2")
    try:
        pprint(client.stop_instances(InstanceIds=[instance_id], DryRun=dry_run))
    except ClientError as error:
        # A DryRunOperation error only confirms permissions; anything else is real.
        if "DryRunOperation" not in str(error):
            raise
def delete_instance(instance_id, dry_run=False):
    """Terminate the given EC2 instance and print its resulting state."""
    client = CONFIG.get_session().client("ec2")
    try:
        result = client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
        current_state = result["TerminatingInstances"][0]["CurrentState"]["Name"]
        print(f"Instance {instance_id}: {current_state}")
    except ClientError as error:
        # Ignore the expected error raised by a successful dry run.
        if "DryRunOperation" not in str(error):
            raise
def delete_instance(instance_id: str, dry_run=False) -> None:
    """Terminate the given EC2 instance and report its resulting state."""
    client = CONFIG.get_session().client("ec2")
    try:
        result = client.terminate_instances(InstanceIds=[instance_id], DryRun=dry_run)
        state = result["TerminatingInstances"][0]["CurrentState"]["Name"]
        nprint_header(f"Instance [green]{instance_id}[/green]: {state}")
    except botocore.exceptions.ClientError as error:
        # Ignore the expected error raised by a successful dry run.
        if "DryRunOperation" not in str(error):
            raise
def check_instance_status(instance_id, dry_run=False):
    """Return the state name ('running', 'stopped', ...) of the given instance.

    Returns None implicitly when a dry run raises DryRunOperation.
    """
    client = CONFIG.get_session().client("ec2")
    try:
        reservations = client.describe_instances(
            InstanceIds=[instance_id],
            Filters=make_instance_filters(),
            DryRun=dry_run,
        )["Reservations"]
        return reservations[0]["Instances"][0]["State"]["Name"]
    except ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
def record_event(cmd):
    """Best-effort telemetry: POST the executed command name to the endpoint.

    Never raises on network failure, so telemetry can never break a
    user-facing command.

    :param cmd: short name of the command being recorded (e.g. "run")
    """
    if not CONFIG.telemetry:
        return

    # get_session() populates CONFIG.user_id / CONFIG.user_arn as a side effect
    # — presumably via STS; confirmed only by this usage pattern.
    if not CONFIG.user_id:
        CONFIG.get_session()

    data = {
        "user_id": CONFIG.user_id,
        "user_arn": CONFIG.user_arn,
        "cmd": cmd,
        "date": datetime.now().strftime("%Y-%m-%d-%H-%M-%S"),
    }
    try:
        requests.post(CONFIG.telemetry_url, data=json.dumps(data), timeout=2)
    except Exception:
        # Bug fix: was `except BaseException`, which also swallowed
        # KeyboardInterrupt and SystemExit; Exception keeps the best-effort
        # behaviour while letting Ctrl-C propagate.
        pass
def create_instance_profile(role_name, dry_run=False):
    """Create 'NimboInstanceProfile' and attach the given IAM role to it."""
    client = CONFIG.get_session().client("iam")
    profile_name = "NimboInstanceProfile"
    if dry_run:
        return
    client.create_instance_profile(InstanceProfileName=profile_name, Path="/")
    client.add_role_to_instance_profile(
        InstanceProfileName=profile_name, RoleName=role_name
    )
def ec2_instance_types():
    """Yield all available EC2 instance types in region CONFIG.region_name"""
    client = CONFIG.get_session().client("ec2")
    next_token = None
    while True:
        # Walk the paginated describe_instance_types API.
        kwargs = {} if next_token is None else {"NextToken": next_token}
        page = client.describe_instance_types(**kwargs)
        for entry in page["InstanceTypes"]:
            yield entry["InstanceType"]
        next_token = page.get("NextToken")
        if next_token is None:
            break
def list_snapshots():
    """Return nimbo-created EBS snapshots, oldest first (sorted by StartTime)."""
    client = CONFIG.get_session().client("ec2")
    response = client.describe_snapshots(
        Filters=[{"Name": "tag:created_by", "Values": ["nimbo"]}],
        MaxResults=100,
    )
    return sorted(response["Snapshots"], key=lambda snap: snap["StartTime"])
def get_instance_status(instance_id: str, dry_run=False) -> str:
    """Return the state name of `instance_id` (e.g. 'running').

    Returns None implicitly when a dry run raises DryRunOperation.
    """
    client = CONFIG.get_session().client("ec2")
    try:
        reservations = client.describe_instances(
            InstanceIds=[instance_id],
            Filters=AwsInstance._make_instance_filters(),
            DryRun=dry_run,
        )["Reservations"]
        return reservations[0]["Instances"][0]["State"]["Name"]
    except botocore.exceptions.ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
def check_instance_host(instance_id, dry_run=False):
    """Return the public IP of `instance_id`; 'random_host' on a dry run."""
    client = CONFIG.get_session().client("ec2")
    try:
        reservations = client.describe_instances(
            InstanceIds=[instance_id],
            Filters=make_instance_filters(),
            DryRun=dry_run,
        )["Reservations"]
        host = reservations[0]["Instances"][0]["PublicIpAddress"]
    except ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
        # Placeholder value returned when the dry-run permission check passes.
        host = "random_host"
    return host
def _get_host_from_instance_id(instance_id: str, dry_run=False) -> str:
    """Return the public IP of `instance_id`, or '' on a dry run."""
    client = CONFIG.get_session().client("ec2")
    try:
        reservations = client.describe_instances(
            InstanceIds=[instance_id],
            Filters=AwsInstance._make_instance_filters(),
            DryRun=dry_run,
        )["Reservations"]
        host = reservations[0]["Instances"][0]["PublicIpAddress"]
    except botocore.exceptions.ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
        # Dry run succeeded; there is no real host to report.
        host = ""
    return host
def run_commands_on_instance(commands, instance_ids):
    """Runs commands on remote linux instances

    :param commands: a list of strings, each one a command to execute on
        the instances
    :param instance_ids: a list of instance_id strings, of the instances on
        which to execute the command
    :return: the response from the send_command function (check the boto3
        docs for ssm client.send_command() )
    """
    ssm_client = CONFIG.get_session().client("ssm")
    return ssm_client.send_command(
        DocumentName="AWS-RunShellScript",  # one of AWS' preconfigured documents
        Parameters={"commands": commands},
        InstanceIds=instance_ids,
    )
def create_security_group(group_name, dry_run=False):
    """Create a security group in the account's first VPC and report its id."""
    client = CONFIG.get_session().client("ec2")

    # Use the first VPC returned; falls back to "" if none is present.
    vpc_id = client.describe_vpcs().get("Vpcs", [{}])[0].get("VpcId", "")

    created = client.create_security_group(
        GroupName=group_name,
        Description="Base VPC security group for Nimbo jobs.",
        VpcId=vpc_id,
        DryRun=dry_run,
    )
    group_id = created["GroupId"]
    print(f"Security Group {group_name} (id={group_id}) Created in vpc {vpc_id}.")
def show_stopped_instances(dry_run=False):
    """Print id, launch time and type of every stopped/stopping nimbo instance."""
    client = CONFIG.get_session().client("ec2")
    state_filter = {"Name": "instance-state-name", "Values": ["stopped", "stopping"]}
    try:
        response = client.describe_instances(
            Filters=[state_filter] + make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for instance in reservation["Instances"]:
                print(
                    f"ID: {instance['InstanceId']}\n"
                    f"Launch Time: {instance['LaunchTime']}\n"
                    f"InstanceType: {instance['InstanceType']}\n"
                )
    except ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
def _instance_types() -> Generator[str, None, None]:
    """Yield all relevant EC2 instance types in region CONFIG.region_name"""
    client = CONFIG.get_session().client("ec2")

    def all_types():
        # Walk the paginated describe_instance_types API.
        kwargs = {}
        while True:
            page = client.describe_instance_types(**kwargs)
            for item in page["InstanceTypes"]:
                yield item["InstanceType"]
            if "NextToken" not in page:
                return
            kwargs["NextToken"] = page["NextToken"]

    # Only GPU families are relevant here.
    gpu_prefixes = ("p2", "p3", "p4", "g4d")
    return (name for name in sorted(all_types()) if name.startswith(gpu_prefixes))
def allow_ingress_current_ip(target: str, dry_run=False) -> None:
    """Authorize SSH (tcp/22) ingress on security group `target` from the
    caller's current public IP range.

    Exits with an error message when the group does not exist; silently
    returns when the rule already exists or the caller lacks authorization.
    """
    client = CONFIG.get_session().client("ec2")

    # Resolve the group name to its id.
    try:
        described = client.describe_security_groups(
            GroupNames=[target], DryRun=dry_run
        )
        group_id = described["SecurityGroups"][0]["GroupId"]
    except botocore.exceptions.ClientError as error:
        code = error.response["Error"]["Code"]
        if code == "InvalidGroup.NotFound":
            nprint(
                f"Security group {target} not found. Please use an existing"
                " security group or create a new one in the AWS console.",
                style="error",
            )
            sys.exit(1)
        if code == "UnauthorizedOperation":
            return
        raise

    public_ip = requests.get("https://checkip.amazonaws.com").text.strip()

    # NOTE(review): the /16 mask whitelists the whole 65k-address block around
    # the caller's IP, not just the caller — confirm this breadth is intended.
    ssh_rule = {
        "IpProtocol": "tcp",
        "FromPort": 22,
        "ToPort": 22,
        "IpRanges": [{"CidrIp": f"{public_ip}/16"}],
    }
    try:
        client.authorize_security_group_ingress(
            GroupId=group_id, IpPermissions=[ssh_rule]
        )
    except botocore.exceptions.ClientError as error:
        code = error.response["Error"]["Code"]
        if code in ("InvalidPermission.Duplicate", "UnauthorizedOperation"):
            return
        raise
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """
    # Default the object key to the local file name.
    key = file_name if object_name is None else object_name

    client = CONFIG.get_session().client("s3")
    try:
        client.upload_file(file_name, bucket, key)
    except ClientError as error:
        logging.error(error)
        return False
    return True
def show_active_instances(dry_run=False):
    """Print id, state, launch time, type and IP of every running/pending
    nimbo instance."""
    client = CONFIG.get_session().client("ec2")
    state_filter = {"Name": "instance-state-name", "Values": ["running", "pending"]}
    try:
        response = client.describe_instances(
            Filters=[state_filter] + make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for instance in reservation["Instances"]:
                print(
                    f"Id: {instance['InstanceId']}\n"
                    f"Status: {instance['State']['Name']}\n"
                    f"Launch Time: {instance['LaunchTime']}\n"
                    f"InstanceType: {instance['InstanceType']}\n"
                    f"IP Address: {instance['PublicIpAddress']}\n"
                )
    except ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
def delete_all_instances(dry_run=False):
    """Terminate every running nimbo instance, printing each resulting state."""
    client = CONFIG.get_session().client("ec2")
    running_filter = {"Name": "instance-state-name", "Values": ["running"]}
    try:
        response = client.describe_instances(
            Filters=[running_filter] + make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for instance in reservation["Instances"]:
                instance_id = instance["InstanceId"]
                terminated = client.terminate_instances(InstanceIds=[instance_id])
                state = terminated["TerminatingInstances"][0]["CurrentState"]["Name"]
                print(f"Instance {instance_id}: {state}")
    except ClientError as error:
        if "DryRunOperation" not in str(error):
            raise
def create_bucket(bucket_name, dry_run=False):
    """Create an S3 bucket in the session's region.

    :param bucket_name: Bucket to create
    :param dry_run: currently unused
    :return: True if bucket was created (or already owned by you), else False
    """
    try:
        session = CONFIG.get_session()
        s3 = session.client("s3")
        location = {"LocationConstraint": session.region_name}
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=location)
    except ClientError as e:
        if e.response["Error"]["Code"] == "BucketAlreadyOwnedByYou":
            # Bug fix: the message previously hard-coded 'nimbo-main-bucket'
            # regardless of which bucket was actually requested.
            print(f"Bucket {bucket_name} already exists.")
        else:
            logging.error(e)
            return False
    print("Bucket %s created." % bucket_name)
    return True
def mk_bucket(bucket_name: str, dry_run=False) -> None:
    """Create an S3 bucket in the session's region.

    :param bucket_name: Bucket to create
    :param dry_run: currently unused
    :return: None; the outcome is printed rather than returned
        (docstring previously claimed a bool return that never existed)
    """
    try:
        session = CONFIG.get_session()
        s3 = session.client("s3")
        location = {"LocationConstraint": session.region_name}
        s3.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=location)
    except botocore.exceptions.ClientError as e:
        if e.response["Error"]["Code"] == "BucketAlreadyOwnedByYou":
            # Bug fix: the message previously hard-coded 'nimbo-main-bucket'
            # regardless of which bucket was actually requested.
            nprint(f"Bucket {bucket_name} already exists.", style="warning")
        else:
            nprint(e, style="error")
            return
    print("Bucket %s created." % bucket_name)
def allow_inbound_current_ip(group_name, dry_run=False):
    """Open SSH (tcp/22) on `group_name` for the caller's current public IP range."""
    client = CONFIG.get_session().client("ec2")

    # Resolve the security group name to its id.
    described = client.describe_security_groups(
        GroupNames=[group_name], DryRun=dry_run
    )
    group_id = described["SecurityGroups"][0]["GroupId"]

    public_ip = requests.get("https://checkip.amazonaws.com").text.strip()

    # NOTE(review): /16 whitelists a 65k-address block, not a single host.
    result = client.authorize_security_group_ingress(
        GroupId=group_id,
        IpPermissions=[
            {
                "IpProtocol": "tcp",
                "FromPort": 22,
                "ToPort": 22,
                "IpRanges": [{"CidrIp": f"{public_ip}/16"}],
            }
        ],
    )
    print("Ingress Successfully Set")
    pprint(result)
def run_access_test(dry_run=False):
    """End-to-end permission check for a nimbo setup.

    Verifies, in order: local S3 read/write via the aws CLI, EC2 instance
    creation, instance deletion, SSH access with the configured key and
    security group, and the instance profile's S3/EC2 permissions (via a
    remote test script). Exits with status 1 on the first failed step;
    any test instance launched is deleted unless CONFIG.persist is set.
    """
    if dry_run:
        return

    # Force a fixed, foreground, non-persistent configuration for the test.
    CONFIG.instance_type = "t3.medium"
    CONFIG.run_in_background = False
    CONFIG.persist = False

    try:
        # Send test file to s3 results path and delete it
        profile = CONFIG.aws_profile
        region = CONFIG.region_name
        results_path = CONFIG.s3_results_path
        subprocess.check_output(
            "echo 'Hello World' > nimbo-access-test.txt", shell=True
        )
        command = s3_cp_command("nimbo-access-test.txt", results_path)
        subprocess.check_output(command, shell=True)
        command = f"aws s3 ls {results_path} --profile {profile} --region {region}"
        subprocess.check_output(command, shell=True)
        command = (
            f"aws s3 rm {results_path}/nimbo-access-test.txt "
            f"--profile {profile} --region {region}"
        )
        subprocess.check_output(command, shell=True)
        print(
            "You have the necessary S3 read/write permissions from your computer \u2713"
        )
    except subprocess.CalledProcessError:
        print("\nError.")
        sys.exit(1)

    # access.verify_nimbo_instance_profile(session)
    # print("Instance profile 'NimboInstanceProfile' found \u2713")

    # Launch instance with new volume for anaconda
    print("Launching test instance... ", end="", flush=True)
    ec2 = CONFIG.get_session().client("ec2")
    instance = launch_instance(ec2)
    instance_id = instance["InstanceId"]

    try:
        # Wait for the instance to be running
        wait_for_instance_running(instance_id)
        print(f"Instance running. Instance creation allowed \u2713")
        print(f"InstanceId: {instance_id}")
        print()
        print("Trying to delete this instance...")
        utils.delete_instance(instance_id)
        print("Instance deletion allowed \u2713")

        # Launch a second instance to test SSH and the instance profile.
        print("\nLaunching another instance...")
        instance = launch_instance(ec2)
        instance_id = instance["InstanceId"]
        print(f"Instance running. InstanceId: {instance_id}")

        time.sleep(5)
        host = utils.check_instance_host(instance_id)
        ssh = (
            f"ssh -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no' "
            "-o ServerAliveInterval=20"
        )
        scp = f"scp -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"
        block_until_ssh_ready(host)
        print("Instance key allows ssh access to remote instance \u2713")
        print("Security group allows ssh access to remote instance \u2713")

        # Copy the nimbo config and vars file over, then run the remote S3 test.
        write_nimbo_vars()
        subprocess.check_output(
            f"{scp} {CONFIG.nimbo_config_file} {NIMBO_VARS} ubuntu@{host}:/home/ubuntu/",
            shell=True,
        )
        run_remote_script(ssh, scp, host, instance_id, "", "remote_s3_test.sh")
        print("The instance profile has the required S3 and EC2 permissions \u2713")
        print("\nEverything working \u2713")
        print("Instance has been deleted.")
    except BaseException as e:
        # KeyboardInterrupt / CalledProcessError already show their own output.
        if type(e) != KeyboardInterrupt and type(e) != subprocess.CalledProcessError:
            print(e)
        if not CONFIG.persist:
            print(f"Deleting instance {instance_id} (from local)...")
            utils.delete_instance(instance_id)
        sys.exit(1)
def run_job(job_cmd, dry_run=False):
    """Launch an EC2 instance, sync env/config/code to it, and run `job_cmd`.

    :param job_cmd: the command to run remotely; the sentinel "_nimbo_launch"
        only starts the instance and skips the job setup entirely
    :param dry_run: return immediately with a "_dry_run" message when True
    :return: dict with a "message" status (suffixed "_dry_run" / "_success" /
        "_error") and, once launched, the "instance_id"
    """
    if dry_run:
        return {"message": job_cmd + "_dry_run"}

    # access.verify_nimbo_instance_profile(session)

    # Launch instance with new volume for anaconda
    print("Launching instance... ", end="", flush=True)
    ec2 = CONFIG.get_session().client("ec2")
    telemetry.record_event("run")

    start_t = time.monotonic()
    instance = launch_instance(ec2)
    instance_id = instance["InstanceId"]

    try:
        # Wait for the instance to be running
        wait_for_instance_running(instance_id)
        end_t = time.monotonic()
        print(f"Instance running. ({round((end_t-start_t), 2)}s)")
        print(f"InstanceId: {instance_id}")
        print()

        time.sleep(5)
        host = utils.check_instance_host(instance_id)
        block_until_ssh_ready(host)

        # "_nimbo_launch": instance-only mode; no env sync, no job.
        if job_cmd == "_nimbo_launch":
            print(f"Run 'nimbo ssh {instance_id}' to log onto the instance")
            return {"message": job_cmd + "_success", "instance_id": instance_id}

        ssh = (
            f"ssh -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"
            " -o ServerAliveInterval=20 "
        )
        scp = f"scp -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"

        local_env = "/tmp/local_env.yml"
        user_conda_yml = CONFIG.conda_env
        # TODO: Replace this with shutil
        subprocess.check_output(f"cp {user_conda_yml} {local_env}", shell=True)

        # Send conda env yaml and setup scripts to instance
        print("\nSyncing conda, config, and setup files...")
        write_nimbo_vars()

        # Create project folder and send env and config files there
        subprocess.check_output(f"{ssh} ubuntu@{host} mkdir project", shell=True)
        subprocess.check_output(
            f"{scp} {local_env} {CONFIG.nimbo_config_file} {NIMBO_VARS}"
            f" ubuntu@{host}:/home/ubuntu/project/",
            shell=True,
        )

        # Sync code with instance
        print("\nSyncing code...")
        sync_code(host)

        # Run remote_setup script on instance
        run_remote_script(ssh, scp, host, instance_id, job_cmd, "remote_setup.sh")
        return {"message": job_cmd + "_success", "instance_id": instance_id}
    except BaseException as e:
        # KeyboardInterrupt / CalledProcessError already show their own output.
        if type(e) != KeyboardInterrupt and type(e) != subprocess.CalledProcessError:
            print(e)
        if not CONFIG.persist:
            print(f"Deleting instance {instance_id} (from local)...")
            utils.delete_instance(instance_id)
        return {"message": job_cmd + "_error", "instance_id": instance_id}