def delete_network():
    """Delete the VPC named ``VPC_NAME`` together with its subresources.

    Subnets, internet gateways, route tables and security groups attached to
    the VPC are deleted first, then the VPC itself. Every step is best
    effort: a failure is reported on stdout and through ``u.loge`` and the
    teardown continues with the next resource. Does nothing when no VPC
    named ``VPC_NAME`` exists.

    Relies on module-level ``ec2``, ``u``, ``VPC_NAME`` and
    ``response_type``.
    """
    existing_vpcs = u.get_vpc_dict()
    if VPC_NAME not in existing_vpcs:
        return

    vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
    print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))

    for subnet in vpc.subnets.all():
        try:
            sys.stdout.write("Deleting subnet %s ... " % (subnet.id))
            sys.stdout.write(response_type(subnet.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    for gateway in vpc.internet_gateways.all():
        sys.stdout.write("Deleting gateway %s ... " % (gateway.id))
        # If instances are still using the VPC, detaching raises
        # botocore.exceptions.ClientError (DependencyViolation: "Network ...
        # has some mapped public address(es)"). Handle it like every other
        # step so that one stuck gateway does not abort the whole teardown.
        try:
            sys.stdout.write('detached ... ' if u.is_good_response(
                gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ')
            sys.stdout.write('deleted ' if u.is_good_response(
                gateway.delete()) else ' delete_failed ')
            sys.stdout.write('\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    def describe_route_table(route_table):
        return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))

    for route_table in vpc.route_tables.all():
        sys.stdout.write("Deleting route table %s ... " %
                         (describe_route_table(route_table)))
        try:
            sys.stdout.write(response_type(route_table.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    def describe_security_group(security_group):
        return "%s (%s, %s)" % (security_group.id,
                                u.get_name(security_group.tags),
                                security_group.group_name)

    # TODO: this tries to remove default security group, maybe not remove it?
    for security_group in vpc.security_groups.all():
        sys.stdout.write('Deleting security group %s ... ' %
                         (describe_security_group(security_group)))
        try:
            sys.stdout.write(response_type(security_group.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    sys.stdout.write("Deleting VPC %s ... " % (vpc.id))
    try:
        sys.stdout.write(response_type(vpc.delete()) + '\n')
    except Exception as e:
        sys.stdout.write('failed\n')
        u.loge(str(e) + '\n')
def make_job(self,
             role_name,
             num_tasks=1,
             skip_existing_job_validation=False,
             **kwargs):
    """Return a Job for ``role_name``, reusing or launching AWS instances.

    If instances for the job name already exist they are reused; stopped
    ones are started and waited for. Otherwise ``num_tasks`` new instances
    are launched into the VPC/subnet/security group/keypair looked up
    through ``u`` and tagged with their task names.

    Args:
        role_name: role used together with ``self.name`` to form the job name.
        num_tasks: number of instances the job should have.
        skip_existing_job_validation: if True, doesn't check that existing
            job on server has same number of tasks as requested.
        **kwargs: launch parameters, merged with ``self.kwargs`` through
            ``u.merge_kwargs``. ``instance_type`` is required and is read
            from the caller's kwargs before the merge. Optional keys read
            here: ami, ami_name, availability_zone (falls back to the
            ``ZONE`` environment variable), placement_group,
            use_placement_group, install_script, skip_efs_mount, linux_type,
            user_data, ebs, use_spot, monitoring.

    Returns:
        The created ``Job``, which is also appended to ``self.jobs``.

    Raises:
        AssertionError: on inconsistent arguments or a task-count mismatch.
        SystemExit: with status 1 when on-demand instance creation fails.
    """
    # u.maybe_create_resources()
    assert num_tasks >= 0

    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instance_type = kwargs['instance_type']
    instances = u.lookup_aws_instances(job_name, instance_type=instance_type)
    kwargs = u.merge_kwargs(kwargs, self.kwargs)
    ami = kwargs.get('ami', '')
    ami_name = kwargs.get('ami_name', '')
    availability_zone = kwargs.get('availability_zone', '')
    if not availability_zone:
        availability_zone = os.environ['ZONE']
    placement_group = kwargs.get('placement_group', '')

    # automatically generated placement_group_name
    use_placement_group = kwargs.get('use_placement_group', False)
    assert use_placement_group == False or placement_group == ''
    if use_placement_group:
        placement_group = self.placement_group_name

    install_script = kwargs.get('install_script', '')
    skip_efs_mount = kwargs.get('skip_efs_mount', False)
    linux_type = kwargs.get('linux_type', 'ubuntu')
    # TODO: use heuristics to tell linux type from AMI name
    user_data = kwargs.get('user_data', '')
    if user_data:
        assert user_data.startswith('#!/bin/bash')

    ebs = kwargs.get('ebs', '')
    use_spot = kwargs.get('use_spot', False)
    monitoring = kwargs.get('monitoring', True)

    # always install tmux on Amazon linux types
    if linux_type == 'amazon':
        # EC2 only executes user data as a shell script when it starts with
        # "#!", and the command must sit on its own line; previously it was
        # glued onto the last caller-supplied line (or sent with no shebang
        # at all), which made it a no-op.
        if not user_data:
            user_data = '#!/bin/bash\n'
        elif not user_data.endswith('\n'):
            user_data += '\n'
        user_data += 'sudo yum install tmux -y'

    if user_data:
        user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

    # print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if instances:
        if not skip_existing_job_validation:
            assert len(instances) == num_tasks, (
                "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                % (job_name, instances[0].state, len(instances), num_tasks))
        print("Found existing job " + job_name)

        started = []
        for i in instances:
            if i.state['Name'] == 'stopped':
                i.start()
                started.append(i)

        if started:
            # Assumes these are boto3 ec2.Instance resources (they already
            # expose .state and .start()). The waiter polls until the
            # instance is running and raises if it gives up, instead of
            # the previous loop that never terminated.
            print("Waiting for %d instance(s) to start" % (len(started), ))
            for i in started:
                i.wait_until_running()
        print(instances)
    else:
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))

        assert not (
            ami and ami_name
        ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
        assert ami or ami_name, "Must specify at least one of ami and ami_name"
        if ami_name:
            ami = u.lookup_ami_id(ami_name).id

        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
            availability_zone, region, ', '.join(
                subnet_dict.keys()), availability_zone[:-1])
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)

        self.log("Requesting %d %s" % (num_tasks, instance_type))

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # storage setup
        if ebs:
            args['BlockDeviceMappings'] = ebs

        # network setup
        # TODO: get rid of zone? Zone seems to be required for constructor
        # that allows to enable AssociatePublicIpAddress field
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg

        if monitoring:
            args['Monitoring'] = {'Enabled': True}
        args['UserData'] = user_data

        if use_spot:
            instances = u.create_spot_instances(args)
        else:
            try:
                instances = ec2.create_instances(**args)
            except Exception as e:
                print(f"Instance creation failed with ({e})")
                print("Account number: ", u.get_account_number())
                print("Region: ", u.get_region())
                # Non-zero status so calling scripts can detect the failure.
                sys.exit(1)

        assert instances
        assert len(instances) == num_tasks

        # TODO: make instances match their launch indices. This way
        # tasks can figure out which # they are
        for (task_num, instance) in enumerate(instances):
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    # right after launch, so keep retrying until tagging works
                    task_name = u.format_task_name(task_num, job_name)
                    instance.create_tags(Tags=u.make_name(task_name))
                    break
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds"
                        % (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def network_setup():
    """Create the VPC and its networking resources unless they already exist.

    Reuses the VPC named ``VPC_NAME`` if present, otherwise creates it with
    DNS support enabled. When no gateway named ``DEFAULT_NAME`` exists, an
    internet gateway, a route table with a default route through it, and one
    /20 subnet per availability zone are created. When no security group
    named ``SECURITY_GROUP_NAME`` exists, one is created that allows public
    ICMP, the ports in ``PUBLIC_TCP_RANGES``/``PUBLIC_UDP_RANGES``, and all
    ICMP/TCP/UDP traffic between members of the group.

    Returns:
        Tuple ``(vpc, security_group)``.

    Raises:
        AssertionError: when a consistency check or an AWS response check
            fails.
    """
    # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6

    def error_code(exc):
        """Return the AWS error code carried by ``exc``, or None if absent."""
        # Only botocore ClientError-style exceptions carry .response; reading
        # it unconditionally would mask any other failure behind an
        # AttributeError.
        response = getattr(exc, 'response', None) or {}
        return response.get('Error', {}).get('Code')

    ec2 = u.create_ec2_resource()
    existing_vpcs = u.get_vpc_dict()
    zones = u.get_available_zones()
    if VPC_NAME in existing_vpcs:
        print("Reusing VPC " + VPC_NAME)
        vpc = existing_vpcs[VPC_NAME]
        subnets = list(vpc.subnets.all())
        assert len(subnets) == len(
            zones
        ), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % (
            len(subnets), len(zones))
    else:
        print("Creating VPC " + VPC_NAME)
        vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16')

        # enable DNS on the VPC
        response = vpc.modify_attribute(EnableDnsHostnames={"Value": True})
        assert u.is_good_response(response)
        response = vpc.modify_attribute(EnableDnsSupport={"Value": True})
        assert u.is_good_response(response)

        vpc.create_tags(Tags=u.make_name(VPC_NAME))
        vpc.wait_until_available()

    gateways = u.get_gateway_dict(vpc)
    if DEFAULT_NAME in gateways:
        print("Reusing gateways " + DEFAULT_NAME)
    else:
        print("Creating gateway " + DEFAULT_NAME)
        ig = ec2.create_internet_gateway()
        ig.attach_to_vpc(VpcId=vpc.id)
        ig.create_tags(Tags=u.make_name(DEFAULT_NAME))

        # check that attachment succeeded
        # TODO: sometimes get
        # AssertionError: vpc vpc-33d0804b is in state None
        attach_state = u.get1(ig.attachments, State=-1, VpcId=vpc.id)
        assert attach_state == 'available', "vpc %s is in state %s" % (
            vpc.id, attach_state)

        route_table = vpc.create_route_table()
        route_table.create_tags(Tags=u.make_name(ROUTE_TABLE_NAME))

        dest_cidr = '0.0.0.0/0'
        route_table.create_route(DestinationCidrBlock=dest_cidr,
                                 GatewayId=ig.id)
        # check success: routes look like
        # ec2.Route(route_table_id='rtb-a8b438cf',
        #           destination_cidr_block='0.0.0.0/0')
        # sometimes the new route is not visible yet and this fails with
        # AssertionError: Route for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')]
        # TODO: add a wait/retry?
        assert any(
            existing_route.destination_cidr_block == dest_cidr
            for existing_route in route_table.routes
        ), "Route for %s not found in %s" % (dest_cidr, route_table.routes)

        assert len(zones) <= 16  # for cidr/20 to fit into cidr/16
        ip = 0
        for zone in zones:
            cidr_block = '192.168.%d.0/20' % (ip, )
            ip += 16
            print("Creating subnet %s in zone %s" % (cidr_block, zone))
            subnet = vpc.create_subnet(CidrBlock=cidr_block,
                                       AvailabilityZone=zone)
            subnet.create_tags(Tags=[{
                'Key': 'Name',
                'Value': f'{VPC_NAME}-subnet'
            }, {
                'Key': 'Region',
                'Value': zone
            }])
            u.wait_until_available(subnet)
            route_table.associate_with_subnet(SubnetId=subnet.id)

    # Creates security group if necessary
    existing_security_groups = u.get_security_group_dict()
    if SECURITY_GROUP_NAME in existing_security_groups:
        print("Reusing security group " + SECURITY_GROUP_NAME)
        security_group = existing_security_groups[SECURITY_GROUP_NAME]
    else:
        print("Creating security group " + SECURITY_GROUP_NAME)
        security_group = ec2.create_security_group(
            GroupName=SECURITY_GROUP_NAME,
            Description=SECURITY_GROUP_NAME,
            VpcId=vpc.id)

        security_group.create_tags(Tags=[{
            "Key": "Name",
            "Value": SECURITY_GROUP_NAME
        }])

        # allow ICMP access for public ping
        security_group.authorize_ingress(CidrIp='0.0.0.0/0',
                                         IpProtocol='icmp',
                                         FromPort=-1,
                                         ToPort=-1)

        # open public ports
        # always include SSH port which is required for basic functionality
        assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access"
        for protocol, port_ranges in (("tcp", PUBLIC_TCP_RANGES),
                                      ("udp", PUBLIC_UDP_RANGES)):
            for port in port_ranges:
                # an entry is either a single port or a (from, to) pair
                if u.is_list_or_tuple(port):
                    assert len(port) == 2
                    from_port, to_port = port
                else:
                    from_port, to_port = port, port

                response = security_group.authorize_ingress(
                    IpProtocol=protocol,
                    CidrIp="0.0.0.0/0",
                    FromPort=from_port,
                    ToPort=to_port)
                assert u.is_good_response(response)

        # allow ingress within security group
        # Authorizing ingress doesn't work with names in a non-default VPC,
        # so must use more complicated syntax
        # https://github.com/boto/boto3/issues/158
        for protocol, from_port, to_port in (('icmp', -1, -1),
                                             ('tcp', 0, 65535),
                                             ('udp', 0, 65535)):
            rule = {
                'FromPort': from_port,
                'IpProtocol': protocol,
                'IpRanges': [],
                'PrefixListIds': [],
                'ToPort': to_port,
                'UserIdGroupPairs': [{
                    'GroupId': security_group.id
                }]
            }
            try:
                security_group.authorize_ingress(IpPermissions=[rule])
            except Exception as e:
                # an already-present rule is harmless; anything else is fatal
                if error_code(e) == 'InvalidPermission.Duplicate':
                    print("Warning, got " + str(e))
                else:
                    assert False, "Failed while authorizing ingress with " + str(
                        e)

    return vpc, security_group
def make_job(self, role_name, num_tasks=1, **kwargs):
    """Return a Job for ``role_name``, launching instances when none exist.

    The job name is derived from ``role_name`` and ``self.name``. If
    instances with that name are found they are reused (their count must
    equal ``num_tasks``); otherwise ``num_tasks`` instances are created in
    the subnet of the requested availability zone and tagged with their
    task names. Launch parameters come from ``kwargs`` merged with
    ``self.kwargs``; ``ami``, ``instance_type`` and ``availability_zone``
    are required. The Job is appended to ``self.jobs`` and returned.
    """
    assert num_tasks >= 0

    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instances = u.lookup_aws_instances(job_name)

    options = u.merge_kwargs(kwargs, self.kwargs)
    ami = options['ami']
    instance_type = options['instance_type']
    availability_zone = options['availability_zone']
    placement_group = options.get('placement_group', '')
    install_script = options.get('install_script', '')
    skip_efs_mount = options.get('skip_efs_mount', False)
    linux_type = options.get('linux_type', 'ubuntu')
    user_data = options.get('user_data', '')
    if user_data:
        # marker line appended to any caller-supplied user data
        user_data = user_data + '\necho userdata_ok >> /tmp/is_initialized\n'

    #    print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if not instances:
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))

        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
            availability_zone, region, ', '.join(subnet_dict.keys()))
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)

        self.log("Requesting %d %s" % (num_tasks, instance_type))

        placement = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement['GroupName'] = placement_group

        launch_args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name,
            # network setup
            'NetworkInterfaces': [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }],
            'Placement': placement,
            'UserData': user_data,
        }

        instances = ec2.create_instances(**launch_args)
        assert len(instances) == num_tasks

        # assign proper names to tasks
        for instance in instances:
            tagged = False
            while not tagged:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    task_name = u.format_task_name(instance.ami_launch_index,
                                                   role_name, self.name)
                    # TODO: use instance.create_tags instead like in create_resources.py
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=u.make_name(task_name))
                    tagged = True
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds"
                        % (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)
    else:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
            % (len(instances), num_tasks))
        print("Found existing job " + job_name)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def server_job(name,
               num_tasks=1,
               instance_type=None,
               install_script='',
               placement_group='',
               ami='',
               availability_zone='',
               linux_type=DEFAULT_LINUX_TYPE):
    """Create a job on the AWS cluster with publicly facing ports.

    Existing instances named ``name`` are reused; the only check made here
    is that their count equals ``num_tasks``, so the caller must ensure
    that a job previously launched under the same name has identical
    settings (instance type, placement group).

    Args:
        name: job name; every launched instance is tagged with it.
        num_tasks: number of instances in the job.
        instance_type: EC2 instance type, 'c5.large' when None.
        install_script: passed through to ``Job``.
        placement_group: placement group to create/use, if non-empty.
        ami: image id; falls back to the ``AMI`` environment variable.
        availability_zone: zone whose subnet the instances are launched in.
        linux_type: 'ubuntu' or 'amazon'; selects the root install script.

    Returns:
        The ``Job`` wrapping the instances.

    Side effects:
        Sets the module globals ``SSH_KEY_PATH`` and ``ROOT_INSTALL_SCRIPT``.

    Raises:
        AssertionError: unknown ``linux_type``, task-count mismatch with an
            existing job, or no AMI available.
        KeyError: ``availability_zone`` has no subnet in the VPC.
    """
    global SSH_KEY_PATH
    global ROOT_INSTALL_SCRIPT

    DEFAULT_NAME = u.RESOURCE_NAME
    security_group = u.get_security_group_dict()[DEFAULT_NAME]
    keypair = u.get_keypair_dict()[DEFAULT_NAME]

    # get availability zone -> subnet dictionary
    vpc = u.get_vpc_dict()[DEFAULT_NAME]
    subnet_dict = {}
    for subnet in vpc.subnets.all():
        zone = subnet.availability_zone
        assert zone not in subnet_dict, "More than one subnet in %s, why?" % (
            zone, )
        subnet_dict[zone] = subnet
    subnet = subnet_dict[availability_zone]

    if linux_type == 'ubuntu':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_UBUNTU
    elif linux_type == 'amazon':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_AMAZON
    else:
        # The message used to be emitted with a literal, unformatted '%s'.
        raise AssertionError(
            "Unknown linux type '%s', expected 'ubuntu' or 'amazon'." %
            (linux_type, ))

    if instance_type is None:
        instance_type = 'c5.large'
    instances = lookup_aws_instances(name)

    # todo: get rid of this global variable?
    SSH_KEY_PATH = "%s/%s-%s.pem" % (
        os.environ["HOME"],
        DEFAULT_NAME,
        os.environ['AWS_DEFAULT_REGION'],
    )

    if instances:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number"
            " of tasks %d doesn't match requested %d, kill job manually." %
            (len(instances), num_tasks))
        print("Found existing job " + name)
    else:
        print("Launching new job %s into VPC %s" % (name, DEFAULT_NAME))

        ec2 = boto3.resource('ec2')
        if placement_group:
            _maybe_create_placement_group(placement_group)

        print("Requesting %d %s" % (num_tasks, instance_type))

        if not ami:
            ami = os.environ.get('AMI', '')

        assert ami, "No AMI specified, need AMI env-var or explicit parameter"

        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg

        instances = ec2.create_instances(**args)

        # todo: use task index in name
        for instance in instances:
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=[{
                                        'Key': 'Name',
                                        'Value': name
                                    }])
                    break
                except Exception as e:
                    # This is a plain function, so there is no self.log here;
                    # calling it turned the first tagging failure into a
                    # NameError instead of a retry.
                    print(
                        "create_tags failed with %s, retrying in %d seconds"
                        % (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

        assert len(instances) == num_tasks
        print('{} Instances created'.format(len(instances)))

    job = Job(name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type)
    return job
def main():
    """Delete the AWS resources created for ``DEFAULT_NAME``.

    In order: the EFS and its mount targets, the VPC named ``VPC_NAME`` with
    its subnets, internet gateways, route tables and security groups, the
    keypair, and the local keypair file. Each step is best effort: a failure
    is printed and reported through ``u.loge`` and the remaining steps still
    run. Resources named 'nexus' are protected and the process exits without
    deleting anything.

    Requires the ``AWS_DEFAULT_REGION`` environment variable.
    """
    # TODO: also bring down all the instances and wait for them to come down
    region = os.environ['AWS_DEFAULT_REGION']
    if DEFAULT_NAME == 'nexus':
        print("Nexus resources are protected, don't delete them")
        sys.exit()

    print("Deleting %s resources in region %s" % (
        DEFAULT_NAME,
        region,
    ))
    existing_vpcs = u.get_vpc_dict()
    ec2 = u.create_ec2_resource()

    def response_type(response):
        return 'ok' if u.is_good_response(response) else 'failed'

    # delete EFS
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    efs_client = u.create_efs_client()
    if efs_id:
        try:
            # delete mount targets first
            print("About to delete %s (%s)" % (efs_id, DEFAULT_NAME))
            response = efs_client.describe_mount_targets(FileSystemId=efs_id)
            assert u.is_good_response(response)
            for mount_response in response['MountTargets']:
                mount_target_id = mount_response['MountTargetId']
                sys.stdout.write('Deleting mount target %s ... ' %
                                 (mount_target_id, ))
                sys.stdout.flush()
                delete_response = efs_client.delete_mount_target(
                    MountTargetId=mount_target_id)
                print(response_type(delete_response))

            sys.stdout.write('Deleting EFS %s (%s)... ' %
                             (efs_id, DEFAULT_NAME))
            sys.stdout.flush()
            u.delete_efs_id(efs_id)
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    if VPC_NAME in existing_vpcs:
        vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
        print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))

        for subnet in vpc.subnets.all():
            try:
                sys.stdout.write("Deleting subnet %s ... " % (subnet.id))
                sys.stdout.write(response_type(subnet.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        for gateway in vpc.internet_gateways.all():
            sys.stdout.write("Deleting gateway %s ... " % (gateway.id))
            # If instances are still using the VPC, detaching raises
            # botocore.exceptions.ClientError (DependencyViolation: "Network
            # ... has some mapped public address(es)"). Handle it like the
            # other steps so the keypair cleanup below still runs.
            try:
                sys.stdout.write('detached ... ' if u.is_good_response(
                    gateway.detach_from_vpc(
                        VpcId=vpc.id)) else ' detach_failed ')
                sys.stdout.write('deleted ' if u.is_good_response(
                    gateway.delete()) else ' delete_failed ')
                sys.stdout.write('\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        def describe_route_table(route_table):
            return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))

        for route_table in vpc.route_tables.all():
            sys.stdout.write("Deleting route table %s ... " %
                             (describe_route_table(route_table)))
            try:
                sys.stdout.write(response_type(route_table.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        def describe_security_group(security_group):
            return "%s (%s, %s)" % (security_group.id,
                                    u.get_name(security_group.tags),
                                    security_group.group_name)

        # TODO: this tries to remove default security group, maybe not remove it?
        for security_group in vpc.security_groups.all():
            sys.stdout.write('Deleting security group %s ... ' %
                             (describe_security_group(security_group)))
            try:
                sys.stdout.write(
                    response_type(security_group.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        sys.stdout.write("Deleting VPC %s ... " % (vpc.id))
        # The VPC cannot be deleted while a subresource above is still
        # attached, so this must not abort the rest of the cleanup either.
        try:
            sys.stdout.write(response_type(vpc.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    # delete keypair
    keypairs = u.get_keypair_dict()
    keypair = keypairs.get(DEFAULT_NAME, '')
    if keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (keypair.key_name, DEFAULT_NAME))
            sys.stdout.write(response_type(keypair.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if os.path.exists(keypair_fn):
        print("Deleting local keypair file %s" % (keypair_fn, ))
        # Remove directly instead of building an `rm -f` shell command,
        # which breaks on paths containing spaces or shell metacharacters.
        try:
            os.remove(keypair_fn)
        except OSError as e:
            u.loge(str(e) + '\n')