Example #1
def create_instance_profile_and_role(dry_run=False):
    iam = CONFIG.get_session().client("iam")
    role_name = "NimboS3AndEC2FullAccess"
    instance_profile_name = "NimboInstanceProfile"

    policy = {
        "Version": "2012-10-17",
        "Statement": {
            "Effect": "Allow",
            "Action": "sts:AssumeRole",
            "Principal": {
                "Service": "ec2.amazonaws.com"
            },
        },
    }
    if dry_run:
        return
    iam.create_role(RoleName=role_name,
                    AssumeRolePolicyDocument=json.dumps(policy))
    iam.attach_role_policy(
        PolicyArn="arn:aws:iam::aws:policy/AmazonS3FullAccess",
        RoleName=role_name)
    iam.attach_role_policy(
        PolicyArn="arn:aws:iam::aws:policy/AmazonEC2FullAccess",
        RoleName=role_name)

    iam.create_instance_profile(InstanceProfileName=instance_profile_name,
                                Path="/")
    iam.add_role_to_instance_profile(InstanceProfileName=instance_profile_name,
                                     RoleName=role_name)
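A hedged usage sketch, not taken from Nimbo itself: once the role and profile above exist, the profile is typically attached at launch time through run_instances. The AMI id and instance type are placeholders, and a freshly created profile can take a few seconds to become usable.

def launch_with_nimbo_profile(ami_id="ami-xxxxxxxxxxxxxxxxx",
                              instance_type="p2.xlarge"):
    # Placeholder AMI id and instance type; attaches the profile created
    # above by name.
    ec2 = CONFIG.get_session().client("ec2")
    return ec2.run_instances(
        ImageId=ami_id,
        InstanceType=instance_type,
        MinCount=1,
        MaxCount=1,
        IamInstanceProfile={"Name": "NimboInstanceProfile"},
    )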
Example #2
def verify_nimbo_instance_profile(dry_run=False):
    iam = CONFIG.get_session().client("iam")

    if dry_run:
        return

    response = iam.list_instance_profiles()
    instance_profiles = response["InstanceProfiles"]
    instance_profile_names = [
        p["InstanceProfileName"] for p in instance_profiles
    ]
    if "NimboInstanceProfiles" not in instance_profile_names:
        raise Exception(
            textwrap.dedent(
                """Instance profile 'NimboInstanceProfile' not found.

                An instance profile is necessary to give your instance access
                to EC2 and S3 resources. You can create an instance profile using
                'nimbo create_instance_profile <role_name>'. If you are a root user,
                you can simply run 'nimbo create_instance_profile_and_role', and
                nimbo will create the necessary role policies and instance profile
                for you. Otherwise, please ask your admin for a role that provides
                the necessary EC2 and S3 read/write access.

                For more details please go to docs.nimbo.sh/instance-profiles.
                """))
Example #3
def list_buckets():
    s3 = CONFIG.get_session().client("s3")
    response = s3.list_buckets()

    print("Existing buckets:")
    for bucket in response["Buckets"]:
        print(f' {bucket["Name"]}')
Example #4
def list_spot_gpu_prices(dry_run=False):
    if dry_run:
        return

    instance_types = list(sorted(ec2_instance_types()))
    instance_types = [
        inst for inst in instance_types
        if inst[:2] in ["p2", "p3", "p4"] or inst[:3] in ["g4d"]
    ]

    ec2 = CONFIG.get_session().client("ec2")

    string = format_price_string("InstanceType", "Price ($/hour)", "GPUs",
                                 "CPUs", "Mem (Gb)")
    print(string)

    for instance_type in instance_types:
        response = ec2.describe_spot_price_history(
            InstanceTypes=[instance_type],
            Filters=[{
                "Name": "product-description",
                "Values": ["Linux/UNIX"]
            }],
        )

        price = float(response["SpotPriceHistory"][0]["SpotPrice"])

        num_gpus, gpu_type, mem, cpus = INSTANCE_GPU_MAP[instance_type]
        string = format_price_string(instance_type, round(price, 2),
                                     f"{num_gpus} x {gpu_type}", cpus, mem)
        print(string)
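A hedged variant of the per-instance lookup in the loop above: ProductDescriptions can stand in for the "product-description" filter, and a StartTime of "now" narrows the history to the most recent price. CONFIG is assumed as in the other examples.

from datetime import datetime, timezone

def latest_spot_price(instance_type):
    # Return the most recently reported Linux/UNIX spot price for one type.
    ec2 = CONFIG.get_session().client("ec2")
    response = ec2.describe_spot_price_history(
        InstanceTypes=[instance_type],
        ProductDescriptions=["Linux/UNIX"],
        StartTime=datetime.now(timezone.utc),
    )
    return float(response["SpotPriceHistory"][0]["SpotPrice"])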
Example #5
    def ls_spot_gpu_prices(dry_run=False) -> None:
        if dry_run:
            return

        ec2 = CONFIG.get_session().client("ec2")

        string = AwsUtils._format_price_string("InstanceType",
                                               "Price ($/hour)", "GPUs",
                                               "CPUs", "Mem (Gb)")
        print()
        nprint(string, style="bold")

        for instance_type in AwsUtils._instance_types():
            response = ec2.describe_spot_price_history(
                InstanceTypes=[instance_type],
                Filters=[{
                    "Name": "product-description",
                    "Values": ["Linux/UNIX"]
                }],
            )

            price = float(response["SpotPriceHistory"][0]["SpotPrice"])

            num_gpus, gpu_type, mem, cpus = INSTANCE_GPU_MAP[instance_type]
            string = AwsUtils._format_price_string(instance_type,
                                                   round(price, 2),
                                                   f"{num_gpus} x {gpu_type}",
                                                   cpus, mem)
            print(string)
        print()
Example #6
def list_instance_profiles(dry_run=False):
    iam = CONFIG.get_session().client("iam")

    if dry_run:
        return
    response = iam.list_instance_profiles()
    pprint(response["InstanceProfiles"])
Example #7
def stop_instance(instance_id, dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.stop_instances(InstanceIds=[instance_id],
                                      DryRun=dry_run)
        pprint(response)
    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
Example #8
def delete_instance(instance_id, dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.terminate_instances(InstanceIds=[instance_id],
                                           DryRun=dry_run)
        status = response["TerminatingInstances"][0]["CurrentState"]["Name"]
        print(f"Instance {instance_id}: {status}")
    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
Example #9
    def delete_instance(instance_id: str, dry_run=False) -> None:
        ec2 = CONFIG.get_session().client("ec2")
        try:
            response = ec2.terminate_instances(InstanceIds=[instance_id],
                                               DryRun=dry_run)
            status = response["TerminatingInstances"][0]["CurrentState"][
                "Name"]
            nprint_header(f"Instance [green]{instance_id}[/green]: {status}")
        except botocore.exceptions.ClientError as e:
            if "DryRunOperation" not in str(e):
                raise
Example #10
def check_instance_status(instance_id, dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.describe_instances(InstanceIds=[instance_id],
                                          Filters=make_instance_filters(),
                                          DryRun=dry_run)
        status = response["Reservations"][0]["Instances"][0]["State"]["Name"]
        return status
    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
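Callers that need to block until the instance is up do not have to poll check_instance_status in a loop; a hedged sketch using the built-in EC2 waiter, assuming CONFIG as above:

def wait_until_running(instance_id):
    # Blocks until the instance reports "running"; raises WaiterError on timeout.
    ec2 = CONFIG.get_session().client("ec2")
    ec2.get_waiter("instance_running").wait(InstanceIds=[instance_id])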
Example #11
def record_event(cmd):
    if not CONFIG.telemetry:
        return
    if not CONFIG.user_id:
        CONFIG.get_session()

    now = datetime.now()
    date_time = now.strftime("%Y-%m-%d-%H-%M-%S")

    data = {
        "user_id": CONFIG.user_id,
        "user_arn": CONFIG.user_arn,
        "cmd": cmd,
        "date": date_time,
    }
    try:
        requests.post(CONFIG.telemetry_url, data=json.dumps(data), timeout=2)
    except BaseException:
        pass
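Since the payload is JSON, requests can also serialize it and set the Content-Type header itself. A hedged drop-in for the post call inside record_event, assuming the telemetry endpoint accepts a JSON body either way:

# Hedged variant: requests serializes `data` and sets Content-Type itself.
requests.post(CONFIG.telemetry_url, json=data, timeout=2)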
Example #12
def create_instance_profile(role_name, dry_run=False):
    iam = CONFIG.get_session().client("iam")
    instance_profile_name = "NimboInstanceProfile"

    if dry_run:
        return

    iam.create_instance_profile(InstanceProfileName=instance_profile_name,
                                Path="/")
    iam.add_role_to_instance_profile(InstanceProfileName=instance_profile_name,
                                     RoleName=role_name)
Example #13
def ec2_instance_types():
    """Yield all available EC2 instance types in region CONFIG.region_name"""
    describe_args = {}
    client = CONFIG.get_session().client("ec2")
    while True:
        describe_result = client.describe_instance_types(**describe_args)
        yield from [
            i["InstanceType"] for i in describe_result["InstanceTypes"]
        ]
        if "NextToken" not in describe_result:
            break
        describe_args["NextToken"] = describe_result["NextToken"]
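A hedged alternative to the manual NextToken loop above, using the built-in boto3 paginator for describe_instance_types:

def ec2_instance_types_paginated():
    """Yield all available EC2 instance types via a boto3 paginator."""
    client = CONFIG.get_session().client("ec2")
    paginator = client.get_paginator("describe_instance_types")
    for page in paginator.paginate():
        for item in page["InstanceTypes"]:
            yield item["InstanceType"]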
Example #14
def list_snapshots():
    # Retrieve the list of existing buckets
    ec2 = CONFIG.get_session().client("ec2")

    response = ec2.describe_snapshots(
        Filters=[{
            "Name": "tag:created_by",
            "Values": ["nimbo"]
        }],
        MaxResults=100,
    )
    return list(sorted(response["Snapshots"], key=lambda x: x["StartTime"]))
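One caveat: MaxResults=100 caps the call above at a single page, so snapshots beyond the first hundred would be missed. A hedged sketch that pages through everything; OwnerIds=["self"] is an assumption that restricts the listing to snapshots owned by the current account.

def list_all_snapshots():
    # Page through every Nimbo-created snapshot, oldest first.
    ec2 = CONFIG.get_session().client("ec2")
    paginator = ec2.get_paginator("describe_snapshots")
    pages = paginator.paginate(
        Filters=[{"Name": "tag:created_by", "Values": ["nimbo"]}],
        OwnerIds=["self"],
    )
    snapshots = [snap for page in pages for snap in page["Snapshots"]]
    return sorted(snapshots, key=lambda s: s["StartTime"])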
Example #15
    def get_instance_status(instance_id: str, dry_run=False) -> str:
        ec2 = CONFIG.get_session().client("ec2")
        try:
            response = ec2.describe_instances(
                InstanceIds=[instance_id],
                Filters=AwsInstance._make_instance_filters(),
                DryRun=dry_run,
            )
            status = response["Reservations"][0]["Instances"][0]["State"][
                "Name"]
            return status
        except botocore.exceptions.ClientError as e:
            if "DryRunOperation" not in str(e):
                raise
Example #16
def check_instance_host(instance_id, dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.describe_instances(
            InstanceIds=[instance_id],
            Filters=make_instance_filters(),
            DryRun=dry_run,
        )
        host = response["Reservations"][0]["Instances"][0]["PublicIpAddress"]
    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
        host = "random_host"
    return host
Example #17
    def _get_host_from_instance_id(instance_id: str, dry_run=False) -> str:
        ec2 = CONFIG.get_session().client("ec2")
        try:
            response = ec2.describe_instances(
                InstanceIds=[instance_id],
                Filters=AwsInstance._make_instance_filters(),
                DryRun=dry_run,
            )
            host = response["Reservations"][0]["Instances"][0][
                "PublicIpAddress"]
        except botocore.exceptions.ClientError as e:
            if "DryRunOperation" not in str(e):
                raise
            host = ""
        return host
Example #18
def run_commands_on_instance(commands, instance_ids):
    """Runs commands on remote linux instances
    :param commands: a list of strings, each one a command to execute on the instances
    :param instance_ids: a list of instance_id strings, of the instances on which
                         to execute the command
    :return: the response from the send_command function (check the boto3 docs
             for ssm client.send_command() )
    """

    ssm = CONFIG.get_session().client("ssm")
    resp = ssm.send_command(
        DocumentName="AWS-RunShellScript",  # One of AWS' preconfigured documents
        Parameters={"commands": commands},
        InstanceIds=instance_ids,
    )
    return resp
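A hedged usage sketch: run a single command through the helper above and read its output once SSM has registered the invocation. The instance id is a placeholder, and the target must be running the SSM agent under an instance profile that permits it.

import time

resp = run_commands_on_instance(["nvidia-smi"], ["i-0123456789abcdef0"])
command_id = resp["Command"]["CommandId"]

ssm = CONFIG.get_session().client("ssm")
time.sleep(2)  # the invocation may not be queryable immediately
result = ssm.get_command_invocation(
    CommandId=command_id, InstanceId="i-0123456789abcdef0"
)
print(result["Status"], result["StandardOutputContent"])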
Example #19
def create_security_group(group_name, dry_run=False):

    ec2 = CONFIG.get_session().client("ec2")
    response = ec2.describe_vpcs()
    vpc_id = response.get("Vpcs", [{}])[0].get("VpcId", "")

    response = ec2.create_security_group(
        GroupName=group_name,
        Description="Base VPC security group for Nimbo jobs.",
        VpcId=vpc_id,
        DryRun=dry_run,
    )

    security_group_id = response["GroupId"]
    print(
        f"Security Group {group_name} (id={security_group_id}) Created in vpc {vpc_id}."
    )
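describe_vpcs above simply takes the first VPC returned; a hedged sketch that prefers the account's default VPC by checking the IsDefault field in the response:

def default_vpc_id():
    # Prefer the default VPC; fall back to the first VPC listed.
    ec2 = CONFIG.get_session().client("ec2")
    vpcs = ec2.describe_vpcs().get("Vpcs", [])
    for vpc in vpcs:
        if vpc.get("IsDefault"):
            return vpc["VpcId"]
    return vpcs[0]["VpcId"] if vpcs else ""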
Example #20
def show_stopped_instances(dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.describe_instances(
            Filters=[{
                "Name": "instance-state-name",
                "Values": ["stopped", "stopping"]
            }] + make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for inst in reservation["Instances"]:
                print(f"ID: {inst['InstanceId']}\n"
                      f"Launch Time: {inst['LaunchTime']}\n"
                      f"InstanceType: {inst['InstanceType']}\n")
    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
Example #21
    def _instance_types() -> Generator[str, None, None]:
        """Yield all relevant EC2 instance types in region CONFIG.region_name"""

        describe_args = {}
        client = CONFIG.get_session().client("ec2")

        def instance_type_generator():
            while True:
                describe_result = client.describe_instance_types(
                    **describe_args)
                yield from (i["InstanceType"]
                            for i in describe_result["InstanceTypes"])
                if "NextToken" not in describe_result:
                    break
                describe_args["NextToken"] = describe_result["NextToken"]

        return (inst for inst in sorted(instance_type_generator())
                if inst.startswith(("p2", "p3", "p4", "g4d")))
Example #22
    def allow_ingress_current_ip(target: str, dry_run=False) -> None:
        ec2 = CONFIG.get_session().client("ec2")

        try:
            response = ec2.describe_security_groups(GroupNames=[target],
                                                    DryRun=dry_run)
            security_group_id = response["SecurityGroups"][0]["GroupId"]
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "InvalidGroup.NotFound":
                nprint(
                    f"Security group {target} not found. Please use an existing"
                    " security group or create a new one in the AWS console.",
                    style="error",
                )
                sys.exit(1)
            elif e.response["Error"]["Code"] == "UnauthorizedOperation":
                return
            else:
                raise

        my_public_ip = requests.get(
            "https://checkip.amazonaws.com").text.strip()

        try:
            ec2.authorize_security_group_ingress(
                GroupId=security_group_id,
                IpPermissions=[{
                    "IpProtocol": "tcp",
                    "FromPort": 22,
                    "ToPort": 22,
                    "IpRanges": [{
                        "CidrIp": f"{my_public_ip}/16"
                    }],
                }],
            )
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "InvalidPermission.Duplicate":
                return
            elif e.response["Error"]["Code"] == "UnauthorizedOperation":
                return
            else:
                raise
Example #23
def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3 = CONFIG.get_session().client("s3")
    try:
        s3.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True
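A hedged usage sketch for the helper above; the bucket name and key are placeholders:

uploaded = upload_file("results.csv", "my-nimbo-bucket",
                       object_name="runs/results.csv")
if not uploaded:
    print("Upload failed; see the logged ClientError for details.")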
Example #24
def show_active_instances(dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.describe_instances(
            Filters=[{
                "Name": "instance-state-name",
                "Values": ["running", "pending"]
            }] + make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for inst in reservation["Instances"]:
                print(f"Id: {inst['InstanceId']}\n"
                      f"Status: {inst['State']['Name']}\n"
                      f"Launch Time: {inst['LaunchTime']}\n"
                      f"InstanceType: {inst['InstanceType']}\n"
                      f"IP Address: {inst['PublicIpAddress']}\n")

    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
Example #25
def delete_all_instances(dry_run=False):
    ec2 = CONFIG.get_session().client("ec2")
    try:
        response = ec2.describe_instances(
            Filters=[{
                "Name": "instance-state-name",
                "Values": ["running"]
            }] + make_instance_filters(),
            DryRun=dry_run,
        )
        for reservation in response["Reservations"]:
            for inst in reservation["Instances"]:
                instance_id = inst["InstanceId"]
                delete_response = ec2.terminate_instances(
                    InstanceIds=[instance_id])
                status = delete_response["TerminatingInstances"][0][
                    "CurrentState"]["Name"]
                print(f"Instance {instance_id}: {status}")
    except ClientError as e:
        if "DryRunOperation" not in str(e):
            raise
Example #26
def create_bucket(bucket_name, dry_run=False):
    """Create an S3 bucket in a specified region

    :param bucket_name: Bucket to create
    :param dry_run
    :return: True if bucket created, else False
    """

    try:
        session = CONFIG.get_session()
        s3 = session.client("s3")
        location = {"LocationConstraint": session.region_name}
        s3.create_bucket(Bucket=bucket_name,
                         CreateBucketConfiguration=location)
    except ClientError as e:
        if e.response["Error"]["Code"] == "BucketAlreadyOwnedByYou":
            print("Bucket nimbo-main-bucket already exists.")
        else:
            logging.error(e)
        return False

    print("Bucket %s created." % bucket_name)
    return True
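One caveat worth flagging: S3 rejects an explicit LocationConstraint of "us-east-1", so create_bucket as written would fail in that region. A hedged, region-aware sketch (illustrative only, not Nimbo's API):

def create_bucket_any_region(bucket_name):
    # Buckets in us-east-1 must be created without a CreateBucketConfiguration.
    session = CONFIG.get_session()
    s3 = session.client("s3")
    if session.region_name == "us-east-1":
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={
                "LocationConstraint": session.region_name
            },
        )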
Example #27
    def mk_bucket(bucket_name: str, dry_run=False) -> None:
        """Create an S3 bucket in a specified region

        :param bucket_name: Bucket to create
        :param dry_run
        """

        try:
            session = CONFIG.get_session()
            s3 = session.client("s3")
            location = {"LocationConstraint": session.region_name}
            s3.create_bucket(Bucket=bucket_name,
                             CreateBucketConfiguration=location)
        except botocore.exceptions.ClientError as e:
            if e.response["Error"]["Code"] == "BucketAlreadyOwnedByYou":
                nprint("Bucket nimbo-main-bucket already exists.",
                       style="warning")
            else:
                nprint(e, style="error")
            return

        print("Bucket %s created." % bucket_name)
Example #28
def allow_inbound_current_ip(group_name, dry_run=False):

    ec2 = CONFIG.get_session().client("ec2")

    # Get the security group id
    response = ec2.describe_security_groups(GroupNames=[group_name],
                                            DryRun=dry_run)
    security_group_id = response["SecurityGroups"][0]["GroupId"]

    my_public_ip = requests.get("https://checkip.amazonaws.com").text.strip()

    response = ec2.authorize_security_group_ingress(
        GroupId=security_group_id,
        IpPermissions=[{
            "IpProtocol": "tcp",
            "FromPort": 22,
            "ToPort": 22,
            "IpRanges": [{
                "CidrIp": f"{my_public_ip}/16"
            }],
        }],
    )
    print("Ingress Successfully Set")
    pprint(response)
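Worth noting for this helper and for allow_ingress_current_ip above: checkip.amazonaws.com returns a single address, and the /16 suffix opens the entire surrounding block. If the intent is to admit only the current machine, a /32 mask is the tighter choice, at the cost of re-authorizing whenever the ISP rotates the address. A hedged drop-in for the IpRanges entry:

# Hedged variant: restrict ingress to the caller's exact address.
my_public_ip = requests.get("https://checkip.amazonaws.com").text.strip()
ip_ranges = [{"CidrIp": f"{my_public_ip}/32"}]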
Example #29
def run_access_test(dry_run=False):
    if dry_run:
        return

    CONFIG.instance_type = "t3.medium"
    CONFIG.run_in_background = False
    CONFIG.persist = False

    try:
        # Send test file to s3 results path and delete it
        profile = CONFIG.aws_profile
        region = CONFIG.region_name
        results_path = CONFIG.s3_results_path

        subprocess.check_output(
            "echo 'Hello World' > nimbo-access-test.txt", shell=True
        )

        command = s3_cp_command("nimbo-access-test.txt", results_path)
        subprocess.check_output(command, shell=True)

        command = f"aws s3 ls {results_path} --profile {profile} --region {region}"
        subprocess.check_output(command, shell=True)
        command = (
            f"aws s3 rm {results_path}/nimbo-access-test.txt "
            f"--profile {profile} --region {region}"
        )
        subprocess.check_output(command, shell=True)

        print(
            "You have the necessary S3 read/write permissions from your computer \u2713"
        )

    except subprocess.CalledProcessError:
        print("\nError.")
        sys.exit(1)

    # access.verify_nimbo_instance_profile(session)
    # print("Instance profile 'NimboInstanceProfile' found \u2713")

    # Launch instance with new volume for anaconda
    print("Launching test instance... ", end="", flush=True)
    ec2 = CONFIG.get_session().client("ec2")

    instance = launch_instance(ec2)
    instance_id = instance["InstanceId"]

    try:
        # Wait for the instance to be running
        wait_for_instance_running(instance_id)
        print(f"Instance running. Instance creation allowed \u2713")
        print(f"InstanceId: {instance_id}")
        print()

        print("Trying to delete this instance...")
        utils.delete_instance(instance_id)

        print("Instance deletion allowed \u2713")
        print("\nLaunching another instance...")
        instance = launch_instance(ec2)
        instance_id = instance["InstanceId"]
        print(f"Instance running. InstanceId: {instance_id}")

        time.sleep(5)
        host = utils.check_instance_host(instance_id)
        ssh = (
            f"ssh -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no' "
            "-o ServerAliveInterval=20"
        )
        scp = f"scp -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"

        block_until_ssh_ready(host)

        print("Instance key allows ssh access to remote instance \u2713")
        print("Security group allows ssh access to remote instance \u2713")

        write_nimbo_vars()

        subprocess.check_output(
            f"{scp} {CONFIG.nimbo_config_file} {NIMBO_VARS} ubuntu@{host}:/home/ubuntu/",
            shell=True,
        )
        run_remote_script(ssh, scp, host, instance_id, "", "remote_s3_test.sh")
        print("The instance profile has the required S3 and EC2 permissions \u2713")

        print("\nEverything working \u2713")
        print("Instance has been deleted.")

    except BaseException as e:
        if type(e) != KeyboardInterrupt and type(e) != subprocess.CalledProcessError:
            print(e)

        if not CONFIG.persist:
            print(f"Deleting instance {instance_id} (from local)...")
            utils.delete_instance(instance_id)

        sys.exit(1)
Example #30
def run_job(job_cmd, dry_run=False):
    if dry_run:
        return {"message": job_cmd + "_dry_run"}

    # access.verify_nimbo_instance_profile(session)

    # Launch instance with new volume for anaconda
    print("Launching instance... ", end="", flush=True)
    ec2 = CONFIG.get_session().client("ec2")
    telemetry.record_event("run")

    start_t = time.monotonic()

    instance = launch_instance(ec2)
    instance_id = instance["InstanceId"]

    try:
        # Wait for the instance to be running
        wait_for_instance_running(instance_id)
        end_t = time.monotonic()
        print(f"Instance running. ({round((end_t-start_t), 2)}s)")
        print(f"InstanceId: {instance_id}")
        print()

        time.sleep(5)
        host = utils.check_instance_host(instance_id)

        block_until_ssh_ready(host)

        if job_cmd == "_nimbo_launch":
            print(f"Run 'nimbo ssh {instance_id}' to log onto the instance")
            return {"message": job_cmd + "_success", "instance_id": instance_id}

        ssh = (
            f"ssh -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"
            " -o ServerAliveInterval=20 "
        )
        scp = f"scp -i {CONFIG.instance_key} -o 'StrictHostKeyChecking no'"

        local_env = "/tmp/local_env.yml"
        user_conda_yml = CONFIG.conda_env
        # TODO: Replace this with shutil
        subprocess.check_output(f"cp {user_conda_yml} {local_env}", shell=True)

        # Send conda env yaml and setup scripts to instance
        print("\nSyncing conda, config, and setup files...")
        write_nimbo_vars()

        # Create project folder and send env and config files there
        subprocess.check_output(f"{ssh} ubuntu@{host} mkdir project", shell=True)
        subprocess.check_output(
            f"{scp} {local_env} {CONFIG.nimbo_config_file} {NIMBO_VARS}"
            f" ubuntu@{host}:/home/ubuntu/project/",
            shell=True,
        )

        # Sync code with instance
        print("\nSyncing code...")
        sync_code(host)

        # Run remote_setup script on instance
        run_remote_script(ssh, scp, host, instance_id, job_cmd, "remote_setup.sh")

        return {"message": job_cmd + "_success", "instance_id": instance_id}

    except BaseException as e:
        if type(e) != KeyboardInterrupt and type(e) != subprocess.CalledProcessError:
            print(e)

        if not CONFIG.persist:
            print(f"Deleting instance {instance_id} (from local)...")
            utils.delete_instance(instance_id)

        return {"message": job_cmd + "_error", "instance_id": instance_id}