Exemple #1
0
def grow_ebs_for_task(task_fragment, target_size_gb):
    """Grows the single EBS volume of the task whose name contains task_fragment.

    Scans all instances oldest-first and uses the first one whose Name tag
    contains task_fragment.

    Args:
      task_fragment: substring matched against instance Name tags.
      target_size_gb: new size of the volume, in GB.

    Raises:
      AssertionError: if no instance matches, the matched instance doesn't
        have exactly one volume, or the modify_volume call fails.
    """

    # todo: don't crash on missing/duplicate names
    ec2 = u.create_ec2_resource()
    client = u.create_ec2_client()

    # (launch-time-in-seconds, instance) pairs, sorted oldest first
    instances = [(u.seconds_from_datetime(i.launch_time), i)
                 for i in ec2.instances.all()]
    sorted_instances = sorted(instances, key=itemgetter(0))

    target_instance = None
    for (seconds, instance) in sorted_instances:
        task_name = u.get_name(instance.tags)
        hours_ago = (time.time() - seconds) / 3600
        hours_ago += 8  # adjust for time being in UTC

        if task_fragment in task_name:
            print("Found instance %s launched %.1f hours ago" %
                  (task_name, hours_ago))
            target_instance = instance
            break

    # Previously a missing match silently fell through and resized the last
    # instance iterated (or raised NameError on an empty region); fail loudly.
    assert target_instance is not None, (
        "no instance matching '%s' found" % (task_fragment,))
    print(target_instance.id)

    volumes = list(target_instance.volumes.all())
    assert len(volumes) == 1, "Must have 1 volume"

    print("Growing %s to %s" % (volumes[0].id, target_size_gb))
    response = client.modify_volume(
        VolumeId=volumes[0].id,
        Size=target_size_gb,
    )
    assert u.is_good_response(response)
Exemple #2
0
def keypair_setup():
  """Creates keypair if necessary, saves private key locally, returns the
  keypair object.

  Returns:
    boto3 KeyPair object (pre-existing or freshly created).

  Raises:
    AssertionError: if a keypair exists but its .pem is missing/empty, or a
      stale .pem would block saving a newly created key.
  """

  existing_keypairs = u.get_keypair_dict()
  keypair = existing_keypairs.get(KEYPAIR_NAME, None)
  keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
  if keypair:
    print("Reusing keypair "+KEYPAIR_NAME)
    # check that local pem file exists and is readable
    assert os.path.exists(keypair_fn)
    with open(keypair_fn) as f:
      keypair_contents = f.read()
    assert len(keypair_contents) > 0
    # todo: check that fingerprint matches keypair.key_fingerprint
    return keypair

  # Check for a stale .pem BEFORE creating the keypair on AWS. The original
  # asserted after create_key_pair, which on failure left behind a new AWS
  # keypair whose private key material could never be saved locally.
  assert not os.path.exists(keypair_fn), "previous keypair exists, delete it with 'sudo rm %s'"%(keypair_fn)

  print("Creating keypair "+KEYPAIR_NAME)
  ec2 = u.create_ec2_resource()
  keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)

  with open(keypair_fn, 'w') as f:
    f.write(keypair.key_material)
  os.chmod(keypair_fn, 0o400)  # owner read-only, required by ssh
  return keypair
def keypair_setup():
    """Creates keypair if necessary and saves the private key locally.

    Returns:
      boto3 KeyPair object (pre-existing or freshly created).

    Raises:
      AssertionError: if the keypair exists without a local .pem, the .pem is
        empty, or a stale .pem would shadow a newly created key.
    """

    # Ensure the local directory for .pem files exists.
    # (was: os.system('mkdir -p ...') — shelling out is unnecessary here)
    os.makedirs(os.path.expanduser(u.PRIVATE_KEY_LOCATION), exist_ok=True)

    keypair = u.get_keypair_dict().get(KEYPAIR_NAME, None)
    keypair_fn = u.get_keypair_fn()
    if keypair:
        print("Reusing keypair " + KEYPAIR_NAME)
        # check that local pem file exists and is readable
        assert os.path.exists(
            keypair_fn
        ), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % (
            KEYPAIR_NAME, keypair_fn, KEYPAIR_NAME)
        with open(keypair_fn) as f:
            keypair_contents = f.read()
        assert len(keypair_contents) > 0
        # todo: check that fingerprint matches keypair.key_fingerprint
    else:
        print("Creating keypair " + KEYPAIR_NAME)
        ec2 = u.create_ec2_resource()
        assert not os.path.exists(
            keypair_fn
        ), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % (
            keypair_fn)
        keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)

        # write the .pem and restrict to owner read-only, required by ssh
        with open(keypair_fn, 'w') as f:
            f.write(keypair.key_material)
        os.chmod(keypair_fn, 0o400)

    return keypair
def main():
  """Creates args.replicas io1 volume replicas in args.zone from the single
  snapshot (owned by this account) whose description contains
  args.snapshot_desc."""
  ec2 = u.create_ec2_resource()
  assert not args.snapshot, "Switched to snapshot_desc"

  if not args.zone:
    assert 'zone' in os.environ, 'must specify --zone or $zone'
    args.zone = os.environ['zone']

  # all of this account's snapshots whose description mentions the fragment
  snapshots = [snap for snap in ec2.snapshots.filter(OwnerIds=['self'])
               if args.snapshot_desc in snap.description]

  assert len(snapshots) > 0, f"no snapshot matching {args.snapshot_desc}"
  assert len(snapshots) < 2, f"multiple snapshots matching {args.snapshot_desc}"
  snap = snapshots[0]
  if not args.size_gb:
    args.size_gb = snap.volume_size  # default to the snapshot's own size

  print(f"Making {args.replicas} {args.size_gb} GB replicas in {args.zone}")

  for idx in range(args.volume_offset, args.replicas + args.volume_offset):
    vol_name = 'imagenet_%02d' % (idx)

    vol = ec2.create_volume(Size=args.size_gb,
                            VolumeType='io1',
                            TagSpecifications=create_tags(vol_name),
                            AvailabilityZone=args.zone,
                            SnapshotId=snap.id,
                            Iops=args.iops)
    print(f"Creating {vol_name} {vol.id}")
Exemple #5
0
def main():
    """Terminates (or stops, with --soft) every instance whose name contains
    the module-level `fragment`, after a y/N confirmation (skipped by --yes).

    Honors --skip_tensorboard, --skip_stopped, --limit_to_key and --delay.
    """
    ec2 = u.create_ec2_resource()  # ec2 resource
    ec2_client = u.create_ec2_client()  # ec2 client
    instances = list(ec2.instances.all())  # todo: use filter?
    region = u.get_region()

    instances_to_kill = []
    for i in instances:
        name = u.get_name(i.tags)
        state = i.state['Name']
        if fragment not in name:
            continue
        if args.skip_tensorboard and '.tb.' in name:
            continue
        if args.skip_stopped and state == 'stopped':
            continue
        if args.limit_to_key and USER_KEY_NAME not in i.key_name:
            continue
        if state == 'terminated':
            continue
        instances_to_kill.append(i)
        # NOTE(review): u.get_name(i) here vs u.get_name(i.tags) above —
        # presumably u.get_name accepts both; confirm against its definition.
        print(u.get_name(i), i.instance_type, i.key_name,
              state if state == 'stopped' else '')

    # print extra info if couldn't find anything to kill
    if not instances_to_kill:
        valid_names = sorted(
            list(
                set("%s,%s" % (u.get_name(i), u.get_state(i))
                    for i in instances)))
        from pprint import pprint as pp
        print("Current instances:")
        pp(valid_names)
        print("No running instances found for: Name '%s', key '%s'" %
              (fragment, USER_KEY_NAME))
        if args.skip_tensorboard:
            print("skipping tensorboard")
        return

    action = 'soft terminate' if args.soft else 'terminate'
    if args.yes:
        answer = 'y'
    else:
        answer = input("%d instances found, %s in %s? (y/N) " %
                       (len(instances_to_kill), action, region))
    if not answer:
        answer = "n"
    if answer.lower() == "y" or args.yes:
        instance_ids = [i.id for i in instances_to_kill]
        if args.delay:
            print(f"Sleeping for {args.delay} seconds")
            time.sleep(args.delay)
        if args.soft:
            response = ec2_client.stop_instances(InstanceIds=instance_ids)
            # was: print("... %s", response) — logging-style args are never
            # interpolated by print(); use % formatting
            print("soft terminating, got response: %s" % (response,))
        else:
            response = ec2_client.terminate_instances(InstanceIds=instance_ids)
            print("terminating, got response: %s" % (response,))
    else:
        print("Didn't get y, doing nothing")
Exemple #6
0
def list_spot_requests():
    """Prints id, instance type, key name and state of every spot request."""
    ec2 = u.create_ec2_resource()
    client = u.create_ec2_client()
    requests = client.describe_spot_instance_requests()['SpotInstanceRequests']
    for req in requests:
        spec = req['LaunchSpecification']
        print(req['SpotInstanceRequestId'], spec['InstanceType'],
              spec['KeyName'], req['State'])
Exemple #7
0
def attach_instance_ebs(aws_instance, tag, unix_device=u.DEFAULT_UNIX_DEVICE):
    """Attaches volume to instance. Will try to detach volume if it's already mounted somewhere else. Will retry indefinitely on error.

    Args:
      aws_instance: boto3 Instance object to attach the volume to.
      tag: Name tag of the volume to attach; the volume must also live in the
        availability zone given by the $zone environment variable.
      unix_device: device path the volume is exposed at (e.g. /dev/xvdf).
    """

    ec2 = u.create_ec2_resource()
    # look up the volume by Name tag, restricted to the $zone availability zone
    v = list(
        ec2.volumes.filter(Filters=[{
            'Name': 'tag:Name',
            'Values': [tag]
        }, {
            "Name": "availability-zone",
            'Values': [os.environ['zone']]
        }]).all())
    assert (v), f"Volume {tag} not found."
    v = v[0]  # first match wins; duplicate Name tags are not detected
    volume_name = u.get_name(v)
    already_attached = v.attachments and v.attachments[0][
        'InstanceId'] == aws_instance.id
    instance_name = u.get_name(aws_instance)
    # TODO: still have edge case when it doesn't report as already attached
    # and keeps trying to attach to an instance that has data mounted already
    if already_attached:
        print(
            f'volume {volume_name} ({v.id}) already attached to {instance_name}'
        )
        return
    # detach loop: keep detaching (and re-polling) until AWS reports the
    # volume as 'available', i.e. free to attach
    while v.state != 'available':
        response = v.detach_from_instance()
        instance_id = v.attachments[0]['InstanceId']
        instance_name = u.get_name(instance_id)
        print(
            f'Volume {tag} is attached to {instance_name}, detaching, response={response.get("State", "none")}'
        )
        time.sleep(ATTACH_WAIT_INTERVAL_SEC)
        v.reload()
    # attach loop: retry forever on failure (see note below on one known
    # unrecoverable failure mode)
    while True:
        try:
            response = v.attach_to_instance(InstanceId=aws_instance.id,
                                            Device=unix_device)
            print(
                f'Attaching {volume_name} to {instance_name}: response={response.get("State", "none")}'
            )

        # sometimes have unrecoverable failure on brand new instance with
        # possibly because of https://forums.aws.amazon.com/thread.jspa?threadID=66192
        #    Error attaching volume: (An error occurred (InvalidParameterValue) when calling the AttachVolume operation: Invalid value '/dev/xvdf' for unixDevice. Attachment point /dev/xvdf is already in use). Retrying in 5 An error occurred (InvalidParameterValue) when calling the AttachVolume operation: Invalid value '/dev/xvdf' for unixDevice. Attachment point /dev/xvdf is already in use

        except Exception as e:
            print(f"Failed attaching ({v.id}) to ({aws_instance.id})")
            print(
                f'Error attaching volume: ({e}). Retrying in {ATTACH_WAIT_INTERVAL_SEC}',
                e)
            time.sleep(ATTACH_WAIT_INTERVAL_SEC)
            continue
        else:
            print('Attachment successful')
            break
Exemple #8
0
def main():
    """Opens an ssh session to the most recently launched instance whose
    name, id, or public/private IP contains args.fragment; if the first ssh
    attempt fails, retries once with an alternate username."""
    fragment = args.fragment

    # TODO: prevent CTRL+c/CTRL+d from killing session
    if not args.skip_tmux:
        print("Launching into TMUX session, use CTRL+b d to exit")

    region = u.get_region()
    client = u.create_ec2_client()
    ec2 = u.create_ec2_resource()
    response = client.describe_instances()

    username = os.environ.get("USERNAME", "ubuntu")
    print("Using username '%s'" % (username, ))

    # NOTE(review): `response` above and `instance_list` below are built but
    # never used — actual selection happens via u.get_instances(fragment).
    instance_list = []
    for instance in ec2.instances.all():
        if instance.state['Name'] != 'running':
            continue

        name = u.get_name(instance.tags)
        if (fragment in name or fragment in str(instance.public_ip_address)
                or fragment in str(instance.id)
                or fragment in str(instance.private_ip_address)):
            instance_list.append((u.toseconds(instance.launch_time), instance))

    from tzlocal import get_localzone  # $ pip install tzlocal

    filtered_instance_list = u.get_instances(fragment)
    if not filtered_instance_list:
        print("no instance id contains fragment '%s'" % (fragment, ))
        return

    # connect to most recent instance
    print(filtered_instance_list)
    instance = filtered_instance_list[0]
    print("Connecting to ", u.get_name(instance), " launched ",
          instance.launch_time.astimezone(get_localzone()))
    cmd = ''
    keypair_fn = u.get_keypair_fn()
    cmd = make_cmd(keypair_fn, username, instance.public_ip_address)

    print(cmd)
    result = os.system(cmd)
    # NOTE(review): the fallback usernames below appear redacted ('******')
    # in this source — confirm the intended alternates before relying on
    # the retry path.
    if username == 'ubuntu':
        username = '******'
    elif username == 'ec2-user':
        username = '******'

    if result != 0:
        print("ssh failed with code %d, trying username %s" %
              (result, username))
    cmd = make_cmd(keypair_fn, username, instance.public_ip_address)
    os.system(cmd)
Exemple #9
0
def main():
    """CLI entry point: 'list' (default) prints VPCs; 'delete <name-or-id>'
    tears down the matching VPC along with its subnets, internet gateways,
    security groups and route tables."""
    if len(sys.argv) < 2:
        mode = 'list'
    else:
        mode = sys.argv[1]

    if mode == 'list':
        list_vpcs()
    elif mode == 'delete':
        assert len(sys.argv) == 3

        assert 'AWS_DEFAULT_REGION' in os.environ
        client = u.create_ec2_client()
        ec2 = u.create_ec2_resource()
        response = client.describe_vpcs()
        for vpc_response in response['Vpcs']:
            vpc_name = _get_name(vpc_response.get('Tags', []))
            vpc = ec2.Vpc(vpc_response['VpcId'])
            # match on either the Name tag or the raw VPC id
            if vpc_name == sys.argv[2] or vpc.id == sys.argv[2]:
                print("Deleting VPC name=%s, id=%s" % (vpc_name, vpc.id))

                # dependent resources are deleted before the VPC itself
                for subnet in vpc.subnets.all():
                    print("Deleting subnet %s" % (subnet.id))
                    assert u.is_good_response(subnet.delete())

                # gateways must be detached before they can be deleted
                for gateway in vpc.internet_gateways.all():
                    print("Deleting gateway %s" % (gateway.id))
                    assert u.is_good_response(
                        gateway.detach_from_vpc(VpcId=vpc.id))
                    assert u.is_good_response(gateway.delete())

                # best-effort: default security group / main route table
                # can't be deleted, so failures here are only reported
                for security_group in vpc.security_groups.all():
                    try:
                        assert u.is_good_response(security_group.delete())
                    except Exception as e:
                        print("Failed with " + str(e))

                for route_table in vpc.route_tables.all():
                    print("Deleting route table %s" % (route_table.id))
                    try:
                        assert u.is_good_response(route_table.delete())
                    except Exception as e:
                        print("Failed with " + str(e))

                if u.is_good_response(client.delete_vpc(VpcId=vpc.id)):
                    print("Succeeded deleting VPC ", vpc.id)
Exemple #10
0
def placement_group_setup(group_name):
  """Returns the placement group named group_name, creating it with strategy
  'cluster' if it does not yet exist."""

  groups = u.get_placement_group_dict()
  existing = groups.get(group_name, None)
  if existing:
    # reuse only a healthy cluster-strategy group
    assert existing.state == 'available'
    assert existing.strategy == 'cluster'
    print("Reusing group ", existing.name)
    return existing

  print("Creating group "+group_name)
  ec2 = u.create_ec2_resource()
  return ec2.create_placement_group(GroupName=group_name, Strategy='cluster')
Exemple #11
0
def cancel_spot_requests():
    """Cancels every spot instance request not already cancelled or closed."""
    ec2 = u.create_ec2_resource()
    client = u.create_ec2_client()
    requests = client.describe_spot_instance_requests()['SpotInstanceRequests']
    for req in requests:
        if req['State'] in ('cancelled', 'closed'):
            continue

        spec = req['LaunchSpecification']
        print('cancelling', req['SpotInstanceRequestId'],
              spec['InstanceType'], spec['KeyName'], req['State'])

        client.cancel_spot_instance_requests(
            SpotInstanceRequestIds=[req['SpotInstanceRequestId']])
Exemple #12
0
def get_instance(fragment):
    """Returns the most recently launched instance whose Name tag contains
    fragment, or None (after printing a message) if nothing matches.

    Args:
      fragment: substring matched against instance Name tags.
    """
    ec2 = u.create_ec2_resource()
    instances = [(u.seconds_from_datetime(i.launch_time), i)
                 for i in ec2.instances.all()]
    # latest instance first
    sorted_instances = reversed(sorted(instances, key=itemgetter(0)))
    for (seconds, instance) in sorted_instances:
        name = u.get_name(instance.tags)
        if fragment in name:  # was a redundant second u.get_name() call
            hours_ago = (time.time() - seconds) / 3600
            hours_ago += 8  # adjust for time being in UTC
            print("Found instance %s launched %.1f hours ago" % (
                name,
                hours_ago,
            ))
            return instance
    print("Found nothing matching", fragment)
Exemple #13
0
def list_instances():
    """Prints running instances whose key contains LIMIT_TO_KEY, oldest
    first: age in hours, name, type, public/private IP and id."""
    ec2 = u.create_ec2_resource()
    by_age = sorted([(u.seconds_from_datetime(inst.launch_time), inst)
                     for inst in ec2.instances.all()],
                    key=itemgetter(0))

    for launch_seconds, inst in by_age:
        age_hours = (time.time() - launch_seconds) / 3600
        age_hours += 8  # adjust for time being in UTC
        if inst.state['Name'] != 'running':
            continue
        if LIMIT_TO_KEY not in inst.key_name:
            continue

        print("%4s %20s %10s %20s %s %s" %
              (int(age_hours), u.get_name(inst.tags), inst.instance_type,
               inst.public_ip_address, inst.private_ip_address, inst.id))
Exemple #14
0
def attach_instance_ebs(aws_instance, tag, unix_device=u.DEFAULT_UNIX_DEVICE):
    """Attaches volume to instance. Will try to detach volume if it's already mounted somewhere else. Will retry indefinitely on error.

    Args:
      aws_instance: boto3 Instance object to attach the volume to.
      tag: Name tag of the volume to attach.
      unix_device: device path the volume is exposed at (e.g. /dev/xvdf).
    """

    ec2 = u.create_ec2_resource()
    # look up the volume by Name tag (no availability-zone filter here,
    # unlike the zoned variant of this helper)
    v = list(
        ec2.volumes.filter(Filters=[{
            'Name': 'tag:Name',
            'Values': [tag]
        }]).all())
    assert (v), f"Volume {tag} not found."
    v = v[0]  # first match wins; duplicate Name tags are not detected
    already_attached = v.attachments and v.attachments[0][
        'InstanceId'] == aws_instance.id
    if already_attached:
        print(f'volume {v} already attached')
        return
    # single detach attempt (no wait loop); the attach loop below retries
    # until the detach has actually taken effect
    if v.state != 'available':
        response = v.detach_from_instance()
        print(
            f'Detaching from current instance: response={response.get("State", "none")}'
        )
    # attach loop: retry forever on failure (see note below on one known
    # unrecoverable failure mode)
    while True:
        try:
            response = v.attach_to_instance(InstanceId=aws_instance.id,
                                            Device=unix_device)
            print(
                f'Attaching to current instance: response={response.get("State", "none")}'
            )

        # sometimes have unrecoverable failure on brand new instance with
        # possibly because of https://forums.aws.amazon.com/thread.jspa?threadID=66192
        #    Error attaching volume: (An error occurred (InvalidParameterValue) when calling the AttachVolume operation: Invalid value '/dev/xvdf' for unixDevice. Attachment point /dev/xvdf is already in use). Retrying in 5 An error occurred (InvalidParameterValue) when calling the AttachVolume operation: Invalid value '/dev/xvdf' for unixDevice. Attachment point /dev/xvdf is already in use

        except Exception as e:
            print(f"Failed attaching ({v.id}) to ({aws_instance.id})")
            print(
                f'Error attaching volume: ({e}). Retrying in {ATTACH_WAIT_INTERVAL_SEC}',
                e)
            time.sleep(ATTACH_WAIT_INTERVAL_SEC)
            continue
        else:
            print('Attachment successful')
            break
Exemple #15
0
def list_ebss():
    """Prints one line per instance (oldest first) listing its attached
    volume ids and sizes, e.g.

    imagenet: vol-024347797a6ab11e8 (1500)
    api_service_prod: vol-0c36c9f21bb6be8a6 (8)
    box00.gpubox.0: vol-0c69c68295a89cde5 (50)
    """

    ec2 = u.create_ec2_resource()
    timestamped = sorted([(u.seconds_from_datetime(inst.launch_time), inst)
                          for inst in ec2.instances.all()],
                         key=itemgetter(0))

    for _, inst in timestamped:
        vol_descr = ','.join("%s (%s)" % (vol.id, vol.size)
                             for vol in inst.volumes.all())
        print("%s: %s" % (u.get_name(inst.tags), vol_descr))
def main():
    """Creates args.replicas io1 volumes in $ZONE from the single snapshot
    whose description equals args.snapshot and whose owner is
    args.snapshot_account."""
    ec2 = u.create_ec2_resource()

    assert 'ZONE' in os.environ
    zone = os.environ['ZONE']
    # Filtering by tag:Name doesn't work — Tags are somehow not public:
    # https://stackoverflow.com/questions/51887270/how-to-make-snapshot-tags-public
    # so filter by description + owner instead.
    filters = [
        {'Name': 'description', 'Values': [args.snapshot]},
        {'Name': 'owner-id', 'Values': [args.snapshot_account]},
    ]
    snapshots = list(ec2.snapshots.filter(Filters=filters))

    assert len(snapshots) > 0, f"no snapshot matching {args.snapshot}"
    assert len(snapshots) < 2, f"multiple snapshots matching {args.snapshot}"
    snap = snapshots[0]
    if not args.size_gb:
        args.size_gb = snap.volume_size  # default to the snapshot's own size

    print(f"Making {args.replicas} {args.size_gb} GB replicas in {zone}")

    for idx in range(args.volume_offset, args.replicas + args.volume_offset):
        vol_name = 'imagenet_%02d' % (idx)

        vol = ec2.create_volume(Size=args.size_gb,
                                VolumeType='io1',
                                TagSpecifications=create_tags(vol_name),
                                AvailabilityZone=zone,
                                SnapshotId=snap.id,
                                Iops=args.iops)
        print(f"Creating {vol_name} {vol.id}")
Exemple #17
0
def list_ebss():
    """Prints name, availability zone, attached instance names, and id of
    every EBS volume (only io1 volumes when --io1 is given)."""

    ec2 = u.create_ec2_resource()

    for vol in ec2.volumes.all():
        if args.io1 and vol.volume_type != 'io1':
            continue
        # fall back to the raw volume id when there's no Name tag
        vol_name = u.get_name(vol) or vol.id
        if vol.attachments:
            attached_to = [
                u.get_name(ec2.Instance(att["InstanceId"]))
                for att in vol.attachments
            ]
        else:
            attached_to = ['<unattached>']

        print("%25s %s %s %s" %
              (vol_name, vol.availability_zone, attached_to, vol.id))
Exemple #18
0
    def make_job(self,
                 role_name,
                 num_tasks=1,
                 skip_existing_job_validation=False,
                 **kwargs):
        """Creates (or reuses) a job of num_tasks EC2 instances and returns it.

        Args:
          role_name: role of the job; combined with self.name into the job name.
          num_tasks: number of instances to launch (or expect, when a job with
            this name already exists).
          skip_existing_job_validation: if True, doesn't check that existing job on server has same number of tasks as requested.
          **kwargs: launch parameters (instance_type, ami/ami_name,
            availability_zone, placement_group/use_placement_group,
            install_script, skip_efs_mount, linux_type, user_data, ebs,
            use_spot, monitoring), merged with self.kwargs.

        Returns:
          Job object wrapping the instances; also appended to self.jobs.
        """

        #    u.maybe_create_resources()

        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        instance_type = kwargs['instance_type']
        instances = u.lookup_aws_instances(job_name,
                                           instance_type=instance_type)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs.get('ami', '')
        ami_name = kwargs.get('ami_name', '')
        availability_zone = kwargs.get('availability_zone', '')
        if not availability_zone:
            availability_zone = os.environ['ZONE']
        placement_group = kwargs.get('placement_group', '')

        # automatically generated placement_group_name
        use_placement_group = kwargs.get('use_placement_group', False)
        # explicit placement_group and use_placement_group are mutually exclusive
        assert use_placement_group == False or placement_group == ''
        if use_placement_group:
            placement_group = self.placement_group_name

        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        # TODO: use heuristics to tell linux type from AMI name
        user_data = kwargs.get('user_data', '')

        if user_data:
            assert user_data.startswith('#!/bin/bash')

        ebs = kwargs.get('ebs', '')
        use_spot = kwargs.get('use_spot', False)
        monitoring = kwargs.get('monitoring', True)

        # always install tmux on Amazon linux types
        # TODO: has no effect for some reason
        # https://console.aws.amazon.com/support/v1?region=us-west-2#/case/?displayId=5256445351&language=en
        if linux_type == 'amazon':
            user_data += 'sudo yum install tmux -y'

        if user_data:
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        #    print("Using user_data", user_data)

        # Reuse path: a job with this name already exists on AWS.
        # TODO: also make sure instance type is the same
        if instances:
            if not skip_existing_job_validation:
                assert len(instances) == num_tasks, (
                    "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                    %
                    (job_name, instances[0].state, len(instances), num_tasks))

            print("Found existing job " + job_name)
            starting_instances = False
            for i in instances:
                if i.state['Name'] == 'stopped':
                    i.start()
                    starting_instances = True

            # TODO: replace with proper wait loop
            if starting_instances:
                while True:
                    print("Waiting forever for instances to start")
                    time.sleep(10)

            print(instances)
        else:
            # Launch path: request num_tasks fresh instances.
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            assert not (
                ami and ami_name
            ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
            assert ami or ami_name, "Must specify at least one of ami and ami_name"
            if ami_name:
                ami = u.lookup_ami_id(ami_name).id
            security_group = u.get_security_group_dict()[u.get_resource_name()]

            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
                availability_zone, region, ', '.join(
                    subnet_dict.keys()), availability_zone[:-1])
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            # run_instances/create_instances keyword arguments
            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # storage setup
            if ebs: args['BlockDeviceMappings'] = ebs
            # network setup
            # TODO: get rid of zone? Zone seems to be required for constructor
            # that allows to enable AssociatePublicIpAddress field
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}
            if placement_group: placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg

            if monitoring: args['Monitoring'] = {'Enabled': True}
            args['UserData'] = user_data

            if use_spot: instances = u.create_spot_instances(args)
            else:
                try:
                    instances = ec2.create_instances(**args)
                except Exception as e:
                    print(f"Instance creation failed with ({e})")
                    print("Account number: ", u.get_account_number())
                    print("Region: ", u.get_region())
                    sys.exit()

            assert instances
            assert len(instances) == num_tasks

            # Tag each instance with its task name, retrying on transient
            # AWS errors.
            # TODO: make instances match their launch indices. This way
            # tasks can figure out which # they are
            for (task_num, instance) in enumerate(instances):
                while True:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        # task_name = u.format_task_name(instance.ami_launch_index, role_name,
                        #                                self.name)
                        task_name = u.format_task_name(task_num, job_name)
                        instance.create_tags(Tags=u.make_name(task_name))

                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job
def network_setup():
    """Idempotently create the networking resources for this deployment.

    Creates -- or reuses, when resources with the expected names already
    exist -- a VPC with DNS enabled, an internet gateway with a default
    route, one /20 subnet per availability zone, and a security group that
    opens ICMP, the configured public TCP/UDP port ranges, and all traffic
    between members of the group.

    Returns:
      (vpc, security_group) boto3 resource objects.  Note: despite what the
      old docstring claimed, the subnet is NOT returned.
    """

    # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6

    ec2 = u.create_ec2_resource()
    existing_vpcs = u.get_vpc_dict()
    zones = u.get_available_zones()
    if VPC_NAME in existing_vpcs:
        print("Reusing VPC " + VPC_NAME)
        vpc = existing_vpcs[VPC_NAME]
        subnets = list(vpc.subnets.all())
        # Sanity-check that a previous run finished creating one subnet per
        # availability zone; a partial earlier run must be cleaned up first.
        assert len(subnets) == len(
            zones
        ), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % (
            len(subnets), len(zones))

    else:
        print("Creating VPC " + VPC_NAME)
        vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16')

        # enable DNS on the VPC
        response = vpc.modify_attribute(EnableDnsHostnames={"Value": True})
        assert u.is_good_response(response)
        response = vpc.modify_attribute(EnableDnsSupport={"Value": True})
        assert u.is_good_response(response)

        vpc.create_tags(Tags=u.make_name(VPC_NAME))
        vpc.wait_until_available()

    # Internet gateway: needed so instances in the VPC can reach the
    # public internet.
    gateways = u.get_gateway_dict(vpc)
    if DEFAULT_NAME in gateways:
        print("Reusing gateways " + DEFAULT_NAME)
    else:
        print("Creating gateway " + DEFAULT_NAME)
        ig = ec2.create_internet_gateway()
        ig.attach_to_vpc(VpcId=vpc.id)
        ig.create_tags(Tags=u.make_name(DEFAULT_NAME))

        # check that attachment succeeded
        # TODO: sometimes get
        # AssertionError: vpc vpc-33d0804b is in state None

        attach_state = u.get1(ig.attachments, State=-1, VpcId=vpc.id)
        assert attach_state == 'available', "vpc %s is in state %s" % (
            vpc.id, attach_state)

        route_table = vpc.create_route_table()
        route_table.create_tags(Tags=u.make_name(ROUTE_TABLE_NAME))

        # Route all outbound traffic through the internet gateway.
        dest_cidr = '0.0.0.0/0'
        route = route_table.create_route(DestinationCidrBlock=dest_cidr,
                                         GatewayId=ig.id)
        # check success
        for route in route_table.routes:
            # result looks like this
            # ec2.Route(route_table_id='rtb-a8b438cf',
            #    destination_cidr_block='0.0.0.0/0')
            if route.destination_cidr_block == dest_cidr:
                break
        else:
            # sometimes get
            #      AssertionError: Route for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')]
            # TODO: add a wait/retry?
            assert False, "Route for %s not found in %s" % (dest_cidr,
                                                            route_table.routes)

        # One /20 subnet per zone.  The third octet advances by 16 per zone,
        # so consecutive /20 blocks tile the /16 VPC CIDR without overlap.
        assert len(zones) <= 16  # for cidr/20 to fit into cidr/16
        ip = 0
        for zone in zones:
            cidr_block = '192.168.%d.0/20' % (ip, )
            ip += 16
            print("Creating subnet %s in zone %s" % (cidr_block, zone))
            subnet = vpc.create_subnet(CidrBlock=cidr_block,
                                       AvailabilityZone=zone)
            subnet.create_tags(Tags=[{
                'Key': 'Name',
                'Value': f'{VPC_NAME}-subnet'
            }, {
                'Key': 'Region',
                'Value': zone
            }])
            u.wait_until_available(subnet)
            route_table.associate_with_subnet(SubnetId=subnet.id)

    # Creates security group if necessary
    existing_security_groups = u.get_security_group_dict()
    if SECURITY_GROUP_NAME in existing_security_groups:
        print("Reusing security group " + SECURITY_GROUP_NAME)
        security_group = existing_security_groups[SECURITY_GROUP_NAME]
    else:
        print("Creating security group " + SECURITY_GROUP_NAME)
        security_group = ec2.create_security_group(
            GroupName=SECURITY_GROUP_NAME,
            Description=SECURITY_GROUP_NAME,
            VpcId=vpc.id)

        security_group.create_tags(Tags=[{
            "Key": "Name",
            "Value": SECURITY_GROUP_NAME
        }])

        # allow ICMP access for public ping
        security_group.authorize_ingress(CidrIp='0.0.0.0/0',
                                         IpProtocol='icmp',
                                         FromPort=-1,
                                         ToPort=-1)

        # open public ports
        # always include SSH port which is required for basic functionality
        # NOTE(review): this check only matches a bare 22 entry; a range like
        # (20, 30) covering port 22 would still trip the assert -- confirm
        # PUBLIC_TCP_RANGES always lists 22 explicitly.
        assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access"
        for port in PUBLIC_TCP_RANGES:
            # Entries may be a single port or a (from, to) range.
            if u.is_list_or_tuple(port):
                assert len(port) == 2
                from_port, to_port = port
            else:
                from_port, to_port = port, port

            response = security_group.authorize_ingress(IpProtocol="tcp",
                                                        CidrIp="0.0.0.0/0",
                                                        FromPort=from_port,
                                                        ToPort=to_port)
            assert u.is_good_response(response)

        for port in PUBLIC_UDP_RANGES:
            if u.is_list_or_tuple(port):
                assert len(port) == 2
                from_port, to_port = port
            else:
                from_port, to_port = port, port

            response = security_group.authorize_ingress(IpProtocol="udp",
                                                        CidrIp="0.0.0.0/0",
                                                        FromPort=from_port,
                                                        ToPort=to_port)
            assert u.is_good_response(response)

        # allow ingress within security group
        # Authorizing ingress doesn't work with names in a non-default VPC,
        # so must use more complicated syntax
        # https://github.com/boto/boto3/issues/158

        # ICMP uses -1/-1 to mean "all ICMP types/codes".
        for protocol in ['icmp']:
            try:
                rule = {
                    'FromPort': -1,
                    'IpProtocol': protocol,
                    'IpRanges': [],
                    'PrefixListIds': [],
                    'ToPort': -1,
                    'UserIdGroupPairs': [{
                        'GroupId': security_group.id
                    }]
                }
                security_group.authorize_ingress(IpPermissions=[rule])
            except Exception as e:
                # NOTE(review): assumes e is a botocore ClientError (has a
                # .response attribute); any other exception type would raise
                # AttributeError inside this handler -- confirm.
                if e.response['Error'][
                        'Code'] == 'InvalidPermission.Duplicate':
                    print("Warning, got " + str(e))
                else:
                    assert False, "Failed while authorizing ingress with " + str(
                        e)

        # TCP/UDP: open the full port range within the group.
        for protocol in ['tcp', 'udp']:
            try:
                rule = {
                    'FromPort': 0,
                    'IpProtocol': protocol,
                    'IpRanges': [],
                    'PrefixListIds': [],
                    'ToPort': 65535,
                    'UserIdGroupPairs': [{
                        'GroupId': security_group.id
                    }]
                }
                security_group.authorize_ingress(IpPermissions=[rule])
            except Exception as e:
                if e.response['Error'][
                        'Code'] == 'InvalidPermission.Duplicate':
                    print("Warning, got " + str(e))
                else:
                    assert False, "Failed while authorizing ingress with " + str(
                        e)

    return vpc, security_group
Exemple #20
0
def main():
    """Delete the AWS resources (EFS, VPC and its subresources, keypair)
    tagged with DEFAULT_NAME in the current region.

    Refuses to run against the protected 'nexus' deployment.  Each deletion
    is best-effort: failures are logged via u.loge and the script moves on
    to the next resource.  Mount targets are removed before the EFS itself,
    and VPC subresources (subnets, gateways, route tables, security groups)
    before the VPC, since AWS rejects deletion while dependents exist.
    """
    # TODO: also bring down all the instances and wait for them to come down
    region = os.environ['AWS_DEFAULT_REGION']
    if DEFAULT_NAME == 'nexus':
        print("Nexus resources are protected, don't delete them")
        sys.exit()

    print("Deleting %s resources in region %s" % (
        DEFAULT_NAME,
        region,
    ))
    existing_vpcs = u.get_vpc_dict()
    client = u.create_ec2_client()
    ec2 = u.create_ec2_resource()

    def response_type(response):
        # Short human-readable status for progress messages.
        return 'ok' if u.is_good_response(response) else 'failed'

    # delete EFS
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    efs_client = u.create_efs_client()
    if efs_id:
        try:
            # delete mount targets first
            print("About to delete %s (%s)" % (efs_id, DEFAULT_NAME))
            response = efs_client.describe_mount_targets(FileSystemId=efs_id)
            assert u.is_good_response(response)
            for mount_response in response['MountTargets']:
                # renamed from 'id' which shadowed the builtin; dropped
                # unused subnet/zone/state/ip lookups
                mt_id = mount_response['MountTargetId']
                sys.stdout.write('Deleting mount target %s ... ' % (mt_id, ))
                sys.stdout.flush()
                response = efs_client.delete_mount_target(MountTargetId=mt_id)
                print(response_type(response))

            sys.stdout.write('Deleting EFS %s (%s)... ' %
                             (efs_id, DEFAULT_NAME))
            sys.stdout.flush()
            u.delete_efs_id(efs_id)

        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    if VPC_NAME in existing_vpcs:
        vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
        print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))

        for subnet in vpc.subnets.all():
            try:
                sys.stdout.write("Deleting subnet %s ... " % (subnet.id))
                sys.stdout.write(response_type(subnet.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        for gateway in vpc.internet_gateways.all():
            sys.stdout.write("Deleting gateway %s ... " % (gateway.id))
            # todo: if instances are using VPC, this fails with
            # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway.

            sys.stdout.write('detached ... ' if u.is_good_response(
                gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ')
            sys.stdout.write('deleted ' if u.is_good_response(gateway.delete(
            )) else ' delete_failed ')
            sys.stdout.write('\n')

        def desc(route_table):
            # "rtb-xxxx (name-tag)" for progress messages.
            return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))

        for route_table in vpc.route_tables.all():
            sys.stdout.write("Deleting route table %s ... " %
                             (desc(route_table)))
            try:
                sys.stdout.write(response_type(route_table.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        def desc(security_group):
            # "sg-xxxx (name-tag, group-name)" for progress messages.
            return "%s (%s, %s)" % (security_group.id,
                                    u.get_name(security_group.tags),
                                    security_group.group_name)

        # TODO: this tries to remove default security group, maybe not remove it?
        for security_group in vpc.security_groups.all():
            sys.stdout.write('Deleting security group %s ... ' %
                             (desc(security_group)))
            try:
                sys.stdout.write(response_type(security_group.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        sys.stdout.write("Deleting VPC %s ... " % (vpc.id))
        sys.stdout.write(response_type(vpc.delete()) + '\n')

    # delete keypair
    keypairs = u.get_keypair_dict()
    keypair = keypairs.get(DEFAULT_NAME, '')
    if keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (keypair.key_name, DEFAULT_NAME))
            sys.stdout.write(response_type(keypair.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if os.path.exists(keypair_fn):
        print("Deleting local keypair file %s" % (keypair_fn, ))
        # os.remove instead of shelling out to 'rm -f': avoids shell quoting
        # problems if the path contains spaces or metacharacters.
        os.remove(keypair_fn)
Exemple #21
0
    help=("which resources to delete, all/network/keypair/efs"))
parser.add_argument('--force-delete-efs',
                    action='store_true',
                    help="force deleting main EFS")

args = parser.parse_args()

# All shared resources derive their names from the project-level resource
# name; the keypair has its own naming helper.  (A redundant duplicate
# EFS_NAME assignment was removed.)
EFS_NAME = u.get_resource_name()
VPC_NAME = u.get_resource_name()
SECURITY_GROUP_NAME = u.get_resource_name()
ROUTE_TABLE_NAME = u.get_resource_name()
KEYPAIR_NAME = u.get_keypair_name()

client = u.create_ec2_client()
ec2 = u.create_ec2_resource()


def response_type(response):
    """Map an AWS API response to a short status word for progress output."""
    if u.is_good_response(response):
        return 'ok'
    return 'failed'


def delete_efs():
    efss = u.get_efs_dict()
    efs_id = efss.get(EFS_NAME, '')
    efs_client = u.create_efs_client()
    if efs_id:
        try:
            # delete mount targets first
            print("About to delete %s (%s)" % (efs_id, EFS_NAME))
            response = efs_client.describe_mount_targets(FileSystemId=efs_id)
Exemple #22
0
    print_response(inspect.getframeinfo(inspect.currentframe())[2], route)


def describe_route_tables(ec2_client):
    """Fetch all route tables via the EC2 client and pretty-print them.

    https://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.Client.describe_route_tables
    """
    tables = ec2_client.describe_route_tables()
    # Label the output with the current function's name.
    caller_name = inspect.getframeinfo(inspect.currentframe())[2]
    print_response(caller_name, tables)


if __name__ == '__main__':
    aws = {}
    # Use a named profile when switching between multiple AWS profiles.
    session = boto3.Session(profile_name='my-profile')
    # Create the client and resource handles used below.
    client = create_ec2_client(session)
    resource = create_ec2_resource(session)

    # Create the VPC and verify it.
    aws['vpc_id'] = create_vpc(client)
    add_vpc_name_tag(resource, aws['vpc_id'])
    describe_vpc(client)

    # Create the subnet.
    # Look up the available availability zones.
    zones = describe_availability_zones(client)
    # Use the first availability zone for the subnet.
    first_zone = zones['AvailabilityZones'][0]['ZoneName']
    print_response('first availability zone', first_zone)
    subnet = create_vpc_subnet(resource, aws['vpc_id'], first_zone, '192.168.1.0/24')
    aws['public_subnet_id'] = subnet.subnet_id
    # Add the subnet's Name tag.
Exemple #23
0
    def make_job(self, role_name, num_tasks=1, **kwargs):
        """Launch (or reuse) a job: a named group of EC2 instances.

        If instances tagged with this job name already exist they are reused
        (their count must match num_tasks); otherwise num_tasks instances are
        launched into the configured VPC/subnet/security group and each is
        tagged with a per-task name.

        Args:
          role_name: role portion of the job name (combined with self.name).
          num_tasks: number of EC2 instances the job should have.
          **kwargs: launch parameters merged over self.kwargs.  Keys read
            here: ami, instance_type, availability_zone (required after
            merge), placement_group, install_script, skip_efs_mount,
            linux_type, user_data.

        Returns:
          Job object wrapping the instances; also appended to self.jobs.
        """
        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        instances = u.lookup_aws_instances(job_name)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs['ami']
        instance_type = kwargs['instance_type']
        availability_zone = kwargs['availability_zone']
        placement_group = kwargs.get('placement_group', '')
        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        user_data = kwargs.get('user_data', '')

        # Sentinel appended so tasks can detect that user_data finished.
        if user_data:
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        #    print("Using user_data", user_data)

        # TODO: also make sure instance type is the same
        if instances:
            assert len(instances) == num_tasks, (
                "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
                % (len(instances), num_tasks))
            print("Found existing job " + job_name)
        else:
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            # Look up the networking resources created by create_resources.py.
            security_group = u.get_security_group_dict()[u.get_resource_name()]
            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
                availability_zone, region, ', '.join(subnet_dict.keys()))
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # network setup
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}

            if placement_group: placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg
            args['UserData'] = user_data

            instances = ec2.create_instances(**args)
            assert len(instances) == num_tasks

            # assign proper names to tasks
            for instance in instances:
                # NOTE(review): retries forever -- only safe because the
                # NotFound error below is transient (eventual consistency).
                while True:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        task_name = u.format_task_name(
                            instance.ami_launch_index, role_name, self.name)
                        # TODO: use instance.create_tags instead like in create_resources.py
                        ec2.create_tags(Resources=[instance.id],
                                        Tags=u.make_name(task_name))
                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job