def keypair_setup():
    """Ensure the EC2 keypair and its local private-key (.pem) file both exist.

    If a keypair with the configured name is already registered, the local
    .pem file must exist and be non-empty. Otherwise a new keypair is created,
    its private key material is written to the local .pem file, and that file
    is made read-only for its owner.

    Returns:
        The keypair object: either the existing entry from
        ``u.get_keypair_dict()`` or the object returned by
        ``ec2.create_key_pair``. Note that this is the keypair object, not
        the contents of the private key file.

    Raises:
        AssertionError: if the keypair exists but its .pem file is missing or
            empty, or if a .pem file exists without a registered keypair.
    """
    os.system('mkdir -p ' + u.PRIVATE_KEY_LOCATION)

    keypair_name = u.get_keypair_name()
    keypair = u.get_keypair_dict().get(keypair_name, None)
    keypair_fn = u.get_keypair_fn()

    if keypair:
        print("Reusing keypair " + keypair_name)
        # check that local pem file exists and is readable
        assert os.path.exists(keypair_fn), (
            "Keypair %s exists, but corresponding .pem file %s is not found, "
            "delete keypair %s through console and run again to recreate "
            "keypair/.pem together" % (keypair_name, keypair_fn, keypair_name))
        # Context manager so the handle is closed deterministically.
        with open(keypair_fn) as key_file:
            keypair_contents = key_file.read()
        assert len(keypair_contents) > 0, "%s is empty" % (keypair_fn,)
    else:
        print("Creating keypair " + keypair_name)
        ec2 = u.get_ec2_resource()
        assert not os.path.exists(keypair_fn), (
            "previous keypair exists, delete it with 'sudo rm %s' and also "
            "delete corresponding keypair through console" % (keypair_fn))
        keypair = ec2.create_key_pair(KeyName=keypair_name)
        # Close (and therefore flush) the file before tightening permissions.
        with open(keypair_fn, 'w') as key_file:
            key_file.write(keypair.key_material)
        os.chmod(keypair_fn, 0o400)

    return keypair
def main():
    """Create or delete numbered EBS replicas of the imagenet snapshot.

    Reads the module-level ``args`` (``delete``, ``snapshot``, ``size_gb``,
    ``replicas``, ``volume_offset``). Volumes are named
    ``imagenet_<last two chars of zone>_<NN>`` for NN in
    ``[volume_offset, volume_offset + replicas)``.

    In create mode exactly one snapshot must match ``args.snapshot`` by
    description; volumes that already exist are skipped. In delete mode each
    named volume is deleted if present; a failed deletion is reported and the
    remaining volumes are still processed.

    Raises:
        ValueError: in create mode, if zero or more than one snapshot matches.
    """
    ec2 = u.get_ec2_resource()
    zone = u.get_zone()

    snap = None
    if not args.delete:
        # use filtering by description since Name is not public
        # (to also pin the owner, add
        #  {'Name': 'owner-id', 'Values': [args.snapshot_account]} to Filters)
        snapshots = list(ec2.snapshots.filter(
            Filters=[{"Name": "description", "Values": [args.snapshot]}]))
        if not snapshots:
            raise ValueError(f"no snapshot matching {args.snapshot}")
        if len(snapshots) >= 2:
            raise ValueError(f"multiple snapshots matching {args.snapshot}")
        snap = snapshots[0]
        if not args.size_gb:
            # default the replica size to the snapshot's own size
            args.size_gb = snap.volume_size

    # index existing volumes by name
    vols = {u.get_name(vol): vol for vol in ec2.volumes.all()}

    print(f"{'Deleting' if args.delete else 'Making'} {args.replicas} {args.size_gb} GB replicas in {zone}")

    for i in range(args.volume_offset, args.replicas + args.volume_offset):
        vol_name = f"imagenet_{zone[-2:]}_{i:02d}"

        if args.delete:
            print(f"Deleting {vol_name}")
            if vol_name not in vols:
                print(" Not found")
                continue
            try:
                vols[vol_name].delete()
            except Exception as e:
                # boto3 reports API failures (e.g. volume still attached) as
                # botocore ClientError rather than ValueError; catch broadly
                # so one failed deletion does not abort the remaining ones.
                print(f"Deletion of {vol_name} failed with {e}")
            continue

        if vol_name in vols:
            print(f"{vol_name} exists, skipping")
        else:
            vol = ec2.create_volume(
                Size=args.size_gb,
                TagSpecifications=create_volume_tags(vol_name),
                AvailabilityZone=zone,
                SnapshotId=snap.id,
                Iops=11500,
                VolumeType="io1",
            )
            print(f"Creating {vol_name} {vol.id}")
def placement_group_setup(group_name):
    """Return the placement group named ``group_name``, creating it if absent.

    An existing group is reused only after asserting that it is in the
    'available' state and uses the 'cluster' strategy. Otherwise a new
    'cluster' placement group is created. The group object is returned in
    both cases.
    """
    found = u.get_placement_group_dict().get(group_name, None)

    if not found:
        print("Creating group " + group_name)
        ec2_resource = u.get_ec2_resource()
        return ec2_resource.create_placement_group(GroupName=group_name,
                                                   Strategy='cluster')

    # Reuse path: refuse groups that are not ready or use another strategy.
    assert found.state == 'available'
    assert found.strategy == 'cluster'
    print("Reusing group ", found.name)
    return found
def mount_imagenet(job: ncluster.aws_backend.Job):
    """Attaches EBS disks with imagenet data to each task of the job.

    Task ``i`` gets the volume named
    ``imagenet_<last two chars of zone>_<i + args.offset, zero-padded>``.
    A volume already attached to the task's instance is left alone; a volume
    attached to a different instance is detached first. After attaching, the
    device is mounted at /data by running the mount command through
    ``job.run``, and the function blocks until ``df`` on the first task lists
    /data.

    Raises:
        AssertionError: if an expected volume does not exist, if the first
            volume is not attached to the first task after an attach attempt,
            or if the fallback device is not listed by ``lsblk``.
    """
    task0 = job.tasks[0]
    zone = u.get_zone()
    # index all EC2 volumes by their name
    vols = {}
    ec2 = u.get_ec2_resource()
    for vol in ec2.volumes.all():
        vols[u.get_name(vol)] = vol

    attach_attempted = False
    for i, t in enumerate(job.tasks):
        vol_name = f'imagenet_{zone[-2:]}_{i+args.offset:02d}'
        assert vol_name in vols, f"Volume {vol_name} not found, set your NCLUSTER_ZONE={zone} and run replicate_imagenet.py"
        vol = vols[vol_name]
        print(f"Attaching {vol_name} to {t.name}")
        if vol.attachments:
            # only the first attachment record is inspected
            instance = ec2.Instance(vol.attachments[0]['InstanceId'])
            if instance.id == t.instance.id:
                print(f"{vol_name} already attached")
                continue
            else:  # attached to some other instance, detach
                print(f"detaching {vol_name} from {u.get_name(instance)}")
                vol.detach_from_instance()
                # poll every 5 seconds until the volume reports 'available'
                while vol.state != 'available':
                    vol.reload()
                    time.sleep(5)
                    print(
                        f"waiting for detachment from {u.get_name(instance)}")
                vol.attach_to_instance(InstanceId=t.instance.id,
                                       Device=DEFAULT_UNIX_DEVICE)
                attach_attempted = True
        else:
            vol.attach_to_instance(InstanceId=t.instance.id,
                                   Device=DEFAULT_UNIX_DEVICE)
            attach_attempted = True

    if attach_attempted:
        time.sleep(2)  # wait for attachment to succeed
        # sanity check: only the first task's volume is verified
        i = 0
        vol_name = f'imagenet_{zone[-2:]}_{i+args.offset:02d}'
        vol = vols[vol_name]
        vol.reload()
        assert vol.attachments[0]['InstanceId'] == job.tasks[0].instance.id

    def strip_dev(d):
        """Drop the leading '/dev/' (first five characters) from a device path."""
        return d[len('/dev/'):]

    # mount the volume if /data is not already listed by df on the first task
    df_output = task0.run('df', return_output=True)
    actual_device = DEFAULT_UNIX_DEVICE
    if '/data' not in df_output:
        # hack for p3dn's ignoring device name during volume attachment
        lsblk_output = task0.run('lsblk', return_output=True)
        if strip_dev(DEFAULT_UNIX_DEVICE) not in lsblk_output:
            actual_device = '/dev/nvme3n1'
            assert strip_dev(actual_device) in lsblk_output, f"Hack for p3dn failed, {actual_device} not found, " \
                                                             f"available devices '{lsblk_output}'"

        # the device is chosen from task0's lsblk output; the mount command
        # itself is issued through job.run
        job.run(
            f'sudo mkdir -p /data && sudo chown `whoami` /data && sudo mount {actual_device} /data'
        )
    # block until the mount shows up on the first task
    while '/data' not in task0.run('df', return_output=True):
        time.sleep(ATTACH_WAIT_INTERVAL_SEC)
        print(f"Waiting for attachment")
def network_setup() -> Tuple[Any, Any]:
    """Prepare networking on the default VPC and return ``(vpc, security_group)``.

    Ensures the region has a default VPC (creating one if ``u.get_default_vpc()``
    returns nothing) and that the project security group exists on it, reusing
    an existing group of the same name after checking that it belongs to that
    VPC.

    The function also contains code that builds a dedicated non-default VPC
    (internet gateway, route table, one /20 subnet per zone), but that branch
    is guarded by ``create_non_default_vpc``, which is hard-coded to False.

    Returns:
        Tuple of (default VPC, security group). No subnet is returned.

    Raises:
        AssertionError: if the default VPC cannot be created, or an existing
            security group with the expected name is attached to another VPC.
    """
    # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6
    ec2 = u.get_ec2_resource()
    client = u.get_ec2_client()
    existing_vpcs = u.get_vpc_dict()
    zones = u.get_zones()

    # create VPC from scratch. Remove this if default VPC works well enough.
    # NOTE: the flag is hard-coded to False, so the branch below never runs.
    create_non_default_vpc = False
    if create_non_default_vpc:
        vpc_name = u.get_vpc_name()
        if u.get_vpc_name() in existing_vpcs:
            print("Reusing VPC " + vpc_name)
            vpc = existing_vpcs[vpc_name]
            subnets = list(vpc.subnets.all())
            # a complete earlier run leaves exactly one subnet per zone
            assert len(subnets) == len(
                zones
            ), "Has %s subnets, but %s zones, something went wrong during resource creation, try delete_resources.py/create_resources.py" % (
                len(subnets), len(zones))
        else:
            print("Creating VPC " + vpc_name)
            vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16')
            # enable DNS on the VPC
            response = vpc.modify_attribute(EnableDnsHostnames={"Value": True})
            assert u.is_good_response(response)
            response = vpc.modify_attribute(EnableDnsSupport={"Value": True})
            assert u.is_good_response(response)
            vpc.create_tags(Tags=u.create_name_tags(vpc_name))
            vpc.wait_until_available()

        gateways = u.get_gateway_dict(vpc)
        gateway_name = u.get_gateway_name()
        if gateway_name in gateways:
            print("Reusing gateways " + gateway_name)
        else:
            print("Creating internet gateway " + gateway_name)
            ig = ec2.create_internet_gateway()
            ig.attach_to_vpc(VpcId=vpc.id)
            ig.create_tags(Tags=u.create_name_tags(gateway_name))

            # check that attachment succeeded
            attach_state = u.extract_attr_for_match(ig.attachments,
                                                    State=-1,
                                                    VpcId=vpc.id)
            assert attach_state == 'available', "vpc %s is in state %s" % (
                vpc.id, attach_state)

            # route all outbound traffic through the new internet gateway
            route_table = vpc.create_route_table()
            route_table_name = u.get_route_table_name()
            route_table.create_tags(Tags=u.create_name_tags(route_table_name))
            dest_cidr = '0.0.0.0/0'
            route_table.create_route(DestinationCidrBlock=dest_cidr,
                                     GatewayId=ig.id)
            # check success
            for route in route_table.routes:
                # result looks like this
                # ec2.Route(route_table_id='rtb-a8b438cf',
                #    destination_cidr_block='0.0.0.0/0')
                if route.destination_cidr_block == dest_cidr:
                    break
            else:
                # sometimes get
                # AssertionError: Route for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')]
                # TODO: add a wait/retry?
                assert False, "Route for %s not found in %s" % (
                    dest_cidr, route_table.routes)

            assert len(zones) <= 16  # for cidr/20 to fit into cidr/16
            # one /20 subnet per zone: third octet advances by 16 each time
            ip = 0
            for zone in zones:
                cidr_block = '192.168.%d.0/20' % (ip, )
                ip += 16
                print("Creating subnet %s in zone %s" % (cidr_block, zone))
                subnet = vpc.create_subnet(CidrBlock=cidr_block,
                                           AvailabilityZone=zone)
                subnet.create_tags(Tags=[{
                    'Key': 'Name',
                    'Value': f'{vpc_name}-subnet'
                }, {
                    'Key': 'Region',
                    'Value': zone
                }])
                response = client.modify_subnet_attribute(
                    MapPublicIpOnLaunch={'Value': True}, SubnetId=subnet.id)
                assert u.is_good_response(response)
                u.wait_until_available(subnet)
                assert subnet.map_public_ip_on_launch, "Subnet doesn't enable public IP by default, why?"
                route_table.associate_with_subnet(SubnetId=subnet.id)

        # Setup security group for non-default VPC
        # existing_security_groups = u.get_security_group_dict()
        # security_group_nd_name = u.get_security_group_nd_name()
        # if security_group_nd_name in existing_security_groups:
        #   print("Reusing non-default security group " + security_group_nd_name)
        #   security_group_nd = existing_security_groups[security_group_nd_name]
        #   assert security_group_nd.vpc_id == vpc.id, f"Found non-default security group {security_group_nd} " \
        #                                              f"attached to {security_group_nd.vpc_id} but expected {vpc.id}"
        # else:
        #   security_group_nd = create_security_group(security_group_nd_name, vpc.id)

    # Setup things on default VPC for zone-agnostic launching
    vpc = u.get_default_vpc()
    if not vpc:
        util.log(f"Creating default VPC for region {u.get_region()}")
        client.create_default_vpc()
    # re-query so a freshly created default VPC is picked up
    vpc = u.get_default_vpc()
    assert vpc, "Could not create default VPC?"

    existing_security_groups = u.get_security_group_dict()
    security_group_name = u.get_security_group_name()
    if security_group_name in existing_security_groups:
        print("Reusing security group " + security_group_name)
        security_group = existing_security_groups[security_group_name]
        assert security_group.vpc_id == vpc.id, f"Found security group {security_group} " \
                                                f"attached to {security_group.vpc_id} but expected {vpc.id}"
    else:
        security_group = create_security_group(security_group_name, vpc.id)
        # Uncomment the following when setting up two VPC's
        # security_group = create_security_group(security_group_name, vpc.id, security_group_nd)

    return vpc, security_group
def create_security_group(security_group_name: str, vpc_id: str, other_group: Optional[SecurityGroup] = None):
    """Creates security group with proper ports open. Optionally allows all traffic from other_group.

    The new group allows public ICMP (ping), every TCP port/range in
    ``PUBLIC_TCP_RANGES``, every UDP port/range in ``PUBLIC_UDP_RANGES``, and
    all ICMP/TCP/UDP plus EFA (all-protocol ingress and egress) traffic
    between its own members. When ``other_group`` is given, the same
    group-to-group rules are added in both directions between the two groups.

    Args:
        security_group_name: used as both the name and the description of the
            new group, and for its Name tag.
        vpc_id: id of the VPC the group is created in.
        other_group: optional second security group to peer with.

    Returns:
        The newly created security group.

    Raises:
        AssertionError: if port 22 is not in ``PUBLIC_TCP_RANGES``, if AWS
            returns a bad response, or if authorizing a group-to-group rule
            fails for any reason other than the rule already existing.
    """
    duplicate_code = 'InvalidPermission.Duplicate'

    def aws_error_code(exc):
        """Return the AWS error code carried by ``exc``, or None.

        botocore's ClientError exposes the parsed service error as
        ``exc.response['Error']['Code']``. The code must be read from the
        exception: when the API call raises, nothing is assigned to the
        variable that would have held the call's return value.
        """
        details = getattr(exc, 'response', None)
        if not isinstance(details, dict):
            return None
        error = details.get('Error')
        if not isinstance(error, dict):
            return None
        return error.get('Code')

    print("Creating security group " + security_group_name)
    ec2 = u.get_ec2_resource()
    security_group: SecurityGroup = ec2.create_security_group(
        GroupName=security_group_name,
        Description=security_group_name,
        VpcId=vpc_id)
    security_group.create_tags(Tags=u.create_name_tags(security_group_name))

    # allow ICMP access for public ping
    security_group.authorize_ingress(CidrIp='0.0.0.0/0',
                                     IpProtocol='icmp',
                                     FromPort=-1,
                                     ToPort=-1)

    # open public ports
    # always include SSH port which is required for basic functionality
    assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access"
    for ip_protocol, port_ranges in (("tcp", PUBLIC_TCP_RANGES),
                                     ("udp", PUBLIC_UDP_RANGES)):
        for port in port_ranges:
            # an entry is either a single port or a (from, to) pair
            if util.is_iterable(port):
                assert len(port) == 2
                from_port, to_port = port
            else:
                from_port, to_port = port, port
            response = security_group.authorize_ingress(IpProtocol=ip_protocol,
                                                        CidrIp="0.0.0.0/0",
                                                        FromPort=from_port,
                                                        ToPort=to_port)
            assert u.is_good_response(response)

    def authorize_from_group(this_security_group: SecurityGroup,
                             other_security_group: SecurityGroup):
        """Helper function to authorize all traffic from other_group.

        Can be used to authorize within-group traffic as
        authorize_from_group(group, group). Rules that already exist are
        reported with a warning; any other failure raises AssertionError.
        """
        # Authorizing ingress doesn't work with security group names in a non-default VPC,
        # so must use more complicated syntax: https://github.com/boto/boto3/issues/158
        group_rules = (
            ('icmp', -1, -1, "Failed while authorizing icmp ingress with "),
            ('tcp', 0, 65535, "Failed while authorizing tcp/udp ingress with "),
            ('udp', 0, 65535, "Failed while authorizing tcp/udp ingress with "),
        )
        for protocol, from_port, to_port, failure_prefix in group_rules:
            rule = {
                'FromPort': from_port,
                'IpProtocol': protocol,
                'IpRanges': [],
                'PrefixListIds': [],
                'ToPort': to_port,
                'UserIdGroupPairs': [{
                    'GroupId': other_security_group.id
                }]
            }
            try:
                this_security_group.authorize_ingress(IpPermissions=[rule])
            except Exception as e:
                if aws_error_code(e) != duplicate_code:
                    raise AssertionError(failure_prefix + str(e)) from e
                print("Warning, got " + str(e))

        # authorize EFA traffic (all protocols, both directions)
        user_id = u.get_account_number()
        efa_pair = {
            'Description': 'efa',
            'GroupId': other_security_group.id,
            'UserId': user_id
        }
        efa_calls = (
            ('ingress', this_security_group.authorize_ingress, {
                "IpProtocol": "-1",
                "Ipv6Ranges": [],
                "PrefixListIds": [],
                'UserIdGroupPairs': [dict(efa_pair)]
            }),
            ('egress', this_security_group.authorize_egress, {
                "IpProtocol": "-1",
                "PrefixListIds": [],
                'UserIdGroupPairs': [dict(efa_pair)]
            }),
        )
        # Each direction is handled on its own so that an already-existing
        # ingress rule does not prevent the egress rule from being added.
        for direction, authorize, rule in efa_calls:
            response_ = None
            try:
                response_ = authorize(IpPermissions=[rule])
                if not u.is_good_response(response_):
                    raise AssertionError(str(response_))
            except Exception as e:
                groups = (
                    f"{this_security_group.description} ({this_security_group.id}) to "
                    f"{other_security_group.description} ({other_security_group.id})"
                )
                if aws_error_code(e) == duplicate_code:
                    print(
                        f"Warning while authorizing {direction} from {groups} with message '{e}'"
                    )
                else:
                    # prefer the call's own response; fall back to the one
                    # attached to the exception
                    response = response_ if response_ is not None else getattr(
                        e, 'response', None)
                    raise AssertionError(
                        f"Failed while authorizing {direction} from {groups} "
                        f"with message '{e}' and response '{response}'"
                    ) from e

    authorize_from_group(security_group, security_group)
    # if using multiple security groups, which is required for the case of default + non-default VPC
    # also authorize all traffic between them
    if other_group:
        authorize_from_group(security_group, other_group)
        authorize_from_group(other_group, security_group)

    return security_group
type=str, default='all', help="which resources to delete, all/network/keypair/efs") parser.add_argument('--force_delete_efs', action='store_true', help="force deleting main EFS") args = parser.parse_args() EFS_NAME = u.get_prefix() VPC_NAME = u.get_prefix() SECURITY_GROUP_NAME = u.get_prefix() ROUTE_TABLE_NAME = u.get_prefix() KEYPAIR_NAME = u.get_keypair_name() client = u.get_ec2_client() ec2 = u.get_ec2_resource() def response_type(response): return 'ok' if u.is_good_response(response) else 'failed' def delete_efs(): efss = u.get_efs_dict() efs_id = efss.get(EFS_NAME, '') efs_client = u.get_efs_client() if efs_id: try: # delete mount targets first print("About to delete %s (%s)" % (efs_id, EFS_NAME)) response = efs_client.describe_mount_targets(FileSystemId=efs_id)
def network_setup():
    """Create the project VPC and default-VPC security group if missing.

    Two things are set up:

    1. A dedicated VPC (192.168.0.0/16) with DNS enabled, an internet gateway,
       a route table with a 0.0.0.0/0 route through that gateway, and one
       public /20 subnet per zone. An existing VPC with the expected name is
       reused after checking it has one subnet per zone.
    2. On the region's default VPC (created if absent), a security group with
       public ICMP, the ports in ``PUBLIC_TCP_RANGES`` / ``PUBLIC_UDP_RANGES``,
       and all ICMP/TCP/UDP traffic between members of the group.

    Returns:
        Tuple ``(vpc, security_group)`` where ``vpc`` is the default VPC.
        No subnet is returned.

    Raises:
        AssertionError: if any AWS call reports a bad response, a sanity check
            on the created resources fails, or a within-group ingress rule
            cannot be authorized for a reason other than already existing.
    """

    # from https://gist.github.com/nguyendv/8cfd92fc8ed32ebb78e366f44c2daea6

    def aws_error_code(exc):
        """Return the AWS error code carried by ``exc``, or None.

        botocore's ClientError exposes the parsed service error as
        ``exc.response['Error']['Code']``. The code has to come from the
        exception because a call that raises returns no response.
        """
        details = getattr(exc, 'response', None)
        if not isinstance(details, dict):
            return None
        error = details.get('Error')
        if not isinstance(error, dict):
            return None
        return error.get('Code')

    ec2 = u.get_ec2_resource()
    client = u.get_ec2_client()
    existing_vpcs = u.get_vpc_dict()
    zones = u.get_zones()

    # create VPC from scratch. Remove this if default VPC works well enough.
    vpc_name = u.get_vpc_name()
    if u.get_vpc_name() in existing_vpcs:
        print("Reusing VPC " + vpc_name)
        vpc = existing_vpcs[vpc_name]
        subnets = list(vpc.subnets.all())
        # a complete earlier run leaves exactly one subnet per zone
        assert len(subnets) == len(zones), (
            "Has %s subnets, but %s zones, something went wrong during "
            "resource creation, try delete_resources.py/create_resources.py" %
            (len(subnets), len(zones)))
    else:
        print("Creating VPC " + vpc_name)
        vpc = ec2.create_vpc(CidrBlock='192.168.0.0/16')
        # enable DNS on the VPC
        response = vpc.modify_attribute(EnableDnsHostnames={"Value": True})
        assert u.is_good_response(response)
        response = vpc.modify_attribute(EnableDnsSupport={"Value": True})
        assert u.is_good_response(response)
        vpc.create_tags(Tags=u.create_name_tags(vpc_name))
        vpc.wait_until_available()

    gateways = u.get_gateway_dict(vpc)
    gateway_name = u.get_gateway_name()
    if gateway_name in gateways:
        print("Reusing gateways " + gateway_name)
    else:
        print("Creating internet gateway " + gateway_name)
        ig = ec2.create_internet_gateway()
        ig.attach_to_vpc(VpcId=vpc.id)
        ig.create_tags(Tags=u.create_name_tags(gateway_name))

        # check that attachment succeeded
        attach_state = u.extract_attr_for_match(ig.attachments,
                                                State=-1,
                                                VpcId=vpc.id)
        assert attach_state == 'available', "vpc %s is in state %s" % (
            vpc.id, attach_state)

        # route all outbound traffic through the new internet gateway
        route_table = vpc.create_route_table()
        route_table_name = u.get_route_table_name()
        route_table.create_tags(Tags=u.create_name_tags(route_table_name))
        dest_cidr = '0.0.0.0/0'
        route_table.create_route(DestinationCidrBlock=dest_cidr,
                                 GatewayId=ig.id)

        # check success; routes look like
        # ec2.Route(route_table_id='rtb-a8b438cf',
        #           destination_cidr_block='0.0.0.0/0')
        # sometimes get
        # AssertionError: Route for 0.0.0.0/0 not found in [ec2.Route(route_table_id='rtb-cd9153b0', destination_cidr_block='192.168.0.0/16')]
        # TODO: add a wait/retry?
        assert any(route.destination_cidr_block == dest_cidr
                   for route in route_table.routes), (
                       "Route for %s not found in %s" %
                       (dest_cidr, route_table.routes))

        assert len(zones) <= 16  # for cidr/20 to fit into cidr/16
        # one /20 subnet per zone: third octet advances by 16 each time
        for index, zone in enumerate(zones):
            cidr_block = '192.168.%d.0/20' % (index * 16, )
            print("Creating subnet %s in zone %s" % (cidr_block, zone))
            subnet = vpc.create_subnet(CidrBlock=cidr_block,
                                       AvailabilityZone=zone)
            subnet.create_tags(Tags=[{
                'Key': 'Name',
                'Value': f'{vpc_name}-subnet'
            }, {
                'Key': 'Region',
                'Value': zone
            }])
            response = client.modify_subnet_attribute(
                MapPublicIpOnLaunch={'Value': True}, SubnetId=subnet.id)
            assert u.is_good_response(response)
            u.wait_until_available(subnet)
            assert subnet.map_public_ip_on_launch, "Subnet doesn't enable public IP by default, why?"
            route_table.associate_with_subnet(SubnetId=subnet.id)

    # Use default VPC from now on
    vpc = u.get_default_vpc()
    if not vpc:
        util.log(f"Creating default VPC for region {u.get_region()}")
        client.create_default_vpc()
    # re-query so a freshly created default VPC is picked up
    vpc = u.get_default_vpc()
    assert vpc, "Could not create default VPC?"

    existing_security_groups = u.get_security_group_dict()
    security_group_name = u.get_security_group_name()
    if security_group_name in existing_security_groups:
        print("Reusing security group " + security_group_name)
        security_group = existing_security_groups[security_group_name]
        assert security_group.vpc_id == vpc.id, f"Found security group {security_group} " \
                                                f"attached to {security_group.vpc_id} but expected {vpc.id}"
    else:
        print("Creating security group " + security_group_name)
        security_group = ec2.create_security_group(
            GroupName=security_group_name,
            Description=security_group_name,
            VpcId=vpc.id)
        security_group.create_tags(
            Tags=u.create_name_tags(security_group_name))

        # allow ICMP access for public ping
        security_group.authorize_ingress(CidrIp='0.0.0.0/0',
                                         IpProtocol='icmp',
                                         FromPort=-1,
                                         ToPort=-1)

        # open public ports
        # always include SSH port which is required for basic functionality
        assert 22 in PUBLIC_TCP_RANGES, "Must enable SSH access"
        for ip_protocol, port_ranges in (("tcp", PUBLIC_TCP_RANGES),
                                         ("udp", PUBLIC_UDP_RANGES)):
            for port in port_ranges:
                # an entry is either a single port or a (from, to) pair
                if util.is_iterable(port):
                    assert len(port) == 2
                    from_port, to_port = port
                else:
                    from_port, to_port = port, port
                response = security_group.authorize_ingress(
                    IpProtocol=ip_protocol,
                    CidrIp="0.0.0.0/0",
                    FromPort=from_port,
                    ToPort=to_port)
                assert u.is_good_response(response)

        # allow ingress within security group
        # Authorizing ingress doesn't work with names in a non-default VPC,
        # so must use more complicated syntax
        # https://github.com/boto/boto3/issues/158
        within_group_rules = (('icmp', -1, -1), ('tcp', 0, 65535),
                              ('udp', 0, 65535))
        for protocol, from_port, to_port in within_group_rules:
            rule = {
                'FromPort': from_port,
                'IpProtocol': protocol,
                'IpRanges': [],
                'PrefixListIds': [],
                'ToPort': to_port,
                'UserIdGroupPairs': [{
                    'GroupId': security_group.id
                }]
            }
            try:
                security_group.authorize_ingress(IpPermissions=[rule])
            except Exception as e:
                # an already-existing rule is only worth a warning
                if aws_error_code(e) != 'InvalidPermission.Duplicate':
                    raise AssertionError(
                        "Failed while authorizing ingress with " +
                        str(e)) from e
                print("Warning, got " + str(e))

    return vpc, security_group