def keypair_setup():
    """Creates the AWS keypair KEYPAIR_NAME if necessary and saves its private
    key locally.

    Returns:
      boto3 KeyPair object, either the pre-existing one or a freshly created
      one.  (Note: returns the keypair object, not the key-file contents.)

    Side effects: on creation, writes the private key material to
    u.get_keypair_fn(KEYPAIR_NAME) and restricts permissions to 0400
    (ssh refuses private keys with looser permissions).
    """
    existing_keypairs = u.get_keypair_dict()
    keypair = existing_keypairs.get(KEYPAIR_NAME, None)
    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if keypair:
        print("Reusing keypair " + KEYPAIR_NAME)
        # check that local pem file exists and is readable
        assert os.path.exists(keypair_fn)
        with open(keypair_fn) as f:
            keypair_contents = f.read()
        assert len(keypair_contents) > 0
        # todo: check that fingerprint matches keypair.key_fingerprint
        return keypair

    print("Creating keypair " + KEYPAIR_NAME)
    # Check for a stale local .pem BEFORE creating the remote keypair:
    # creating first and then failing this assert would leave an AWS keypair
    # behind whose private key material can never be retrieved again.
    assert not os.path.exists(keypair_fn), \
        "previous keypair exists, delete it with 'sudo rm %s'" % (keypair_fn)
    ec2 = u.create_ec2_resource()
    keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)
    with open(keypair_fn, 'w') as f:
        f.write(keypair.key_material)
    os.chmod(keypair_fn, 0o400)  # equivalent of 'chmod 400', without a shell
    return keypair
def keypair_setup():
    """Creates the AWS keypair KEYPAIR_NAME if necessary and saves its private
    key under u.PRIVATE_KEY_LOCATION.

    Returns:
      boto3 KeyPair object for KEYPAIR_NAME, either pre-existing or newly
      created.  (Note: returns the keypair object, not the key-file contents.)
    """
    # Ensure the directory that holds private keys exists (error-checked,
    # no shell, unlike os.system('mkdir -p ...')).
    os.makedirs(u.PRIVATE_KEY_LOCATION, exist_ok=True)
    keypair = u.get_keypair_dict().get(KEYPAIR_NAME, None)
    keypair_fn = u.get_keypair_fn()
    if keypair:
        print("Reusing keypair " + KEYPAIR_NAME)
        # check that local pem file exists and is readable
        assert os.path.exists(
            keypair_fn
        ), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % (
            KEYPAIR_NAME, keypair_fn, KEYPAIR_NAME)
        with open(keypair_fn) as f:
            keypair_contents = f.read()
        assert len(keypair_contents) > 0
        # todo: check that fingerprint matches keypair.key_fingerprint
    else:
        print("Creating keypair " + KEYPAIR_NAME)
        ec2 = u.create_ec2_resource()
        # Refuse to clobber a stale local .pem; its remote keypair must be
        # deleted through the console so both can be recreated together.
        assert not os.path.exists(
            keypair_fn
        ), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % (
            keypair_fn)
        keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)
        with open(keypair_fn, 'w') as f:
            f.write(keypair.key_material)
        os.chmod(keypair_fn, 0o400)  # ssh requires restrictive key permissions
    return keypair
def delete_keypair():
    """Deletes the AWS keypair named KEYPAIR_NAME, if one exists, then removes
    the locally cached private key file for it.

    Remote deletion failures are logged via u.loge and do not abort the local
    cleanup.
    """
    remote_keypair = u.get_keypair_dict().get(KEYPAIR_NAME, '')
    if remote_keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (remote_keypair.key_name, KEYPAIR_NAME))
            outcome = response_type(remote_keypair.delete())
            sys.stdout.write(outcome + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    local_pem = u.get_keypair_fn()
    if os.path.exists(local_pem):
        print("Deleting local keypair file %s" % (local_pem, ))
        os.system('rm -f ' + local_pem)
def make_job(self, role_name, num_tasks=1, skip_existing_job_validation=False, **kwargs):
    """Creates (or reuses) a job: a named group of EC2 instances for role_name.

    Args:
      role_name: role of this job, combined with self.name into the job name.
      num_tasks: number of instances requested for the job.
      skip_existing_job_validation: if True, doesn't check that existing job
        on server has same number of tasks as requested.
      **kwargs: launch parameters, merged with self.kwargs.  Keys read here:
        instance_type (required), ami / ami_name (exactly one required when
        launching), availability_zone (falls back to env var ZONE),
        placement_group / use_placement_group (mutually exclusive),
        install_script, skip_efs_mount, linux_type, user_data, ebs,
        use_spot, monitoring.

    Returns:
      Job object wrapping the found-or-launched instances; also appended to
      self.jobs.
    """
    #    u.maybe_create_resources()
    assert num_tasks >= 0
    job_name = u.format_job_name(role_name, self.name)
    # instance_type is read from caller kwargs BEFORE the merge with
    # self.kwargs below, so it must be passed explicitly to this call.
    instance_type = kwargs['instance_type']
    instances = u.lookup_aws_instances(job_name, instance_type=instance_type)
    kwargs = u.merge_kwargs(kwargs, self.kwargs)
    ami = kwargs.get('ami', '')
    ami_name = kwargs.get('ami_name', '')
    availability_zone = kwargs.get('availability_zone', '')
    if not availability_zone:
        # raises KeyError if ZONE is unset and no zone was passed
        availability_zone = os.environ['ZONE']
    placement_group = kwargs.get('placement_group', '')
    # automatically generated placement_group_name; an explicit
    # placement_group and use_placement_group are mutually exclusive
    use_placement_group = kwargs.get('use_placement_group', False)
    assert use_placement_group == False or placement_group == ''
    if use_placement_group:
        placement_group = self.placement_group_name
    install_script = kwargs.get('install_script', '')
    skip_efs_mount = kwargs.get('skip_efs_mount', False)
    linux_type = kwargs.get('linux_type', 'ubuntu')
    # TODO: use heuristics to tell linux type from AMI name
    user_data = kwargs.get('user_data', '')
    if user_data:
        # user_data must be a shell script so the suffixes below make sense
        assert user_data.startswith('#!/bin/bash')
    ebs = kwargs.get('ebs', '')
    use_spot = kwargs.get('use_spot', False)
    monitoring = kwargs.get('monitoring', True)

    # always install tmux on Amazon linux types
    # TODO: has no effect for some reason
    # https://console.aws.amazon.com/support/v1?region=us-west-2#/case/?displayId=5256445351&language=en
    if linux_type == 'amazon':
        user_data += 'sudo yum install tmux -y'

    if user_data:
        # sentinel file lets tasks detect that userdata finished running
        user_data += '\necho userdata_ok >> /tmp/is_initialized\n'
    #    print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if instances:
        # Reuse path: instances with this job name already exist.
        if not skip_existing_job_validation:
            assert len(instances) == num_tasks, (
                "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                % (job_name, instances[0].state, len(instances), num_tasks))
        print("Found existing job " + job_name)
        starting_instances = False
        for i in instances:
            if i.state['Name'] == 'stopped':
                i.start()
                starting_instances = True
        # TODO: replace with proper wait loop
        # NOTE: this loop never exits -- it sleeps forever after starting
        # stopped instances (no break/condition); rerun once they are up.
        if starting_instances:
            while True:
                print("Waiting forever for instances to start")
                time.sleep(10)
        print(instances)
    else:
        # Launch path: no existing instances under this job name.
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))
        assert not (
            ami and ami_name
        ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
        assert ami or ami_name, "Must specify at least one of ami and ami_name"
        if ami_name:
            ami = u.lookup_ami_id(ami_name).id
        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        # hint strips the trailing zone letter to suggest the region name
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
            availability_zone, region, ', '.join(
                subnet_dict.keys()), availability_zone[:-1])
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)
        self.log("Requesting %d %s" % (num_tasks, instance_type))
        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # storage setup
        if ebs:
            args['BlockDeviceMappings'] = ebs
        # network setup
        # TODO: get rid of zone? Zone seems to be required for constructor
        # that allows to enable AssociatePublicIpAddress field
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]
        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg
        if monitoring:
            args['Monitoring'] = {'Enabled': True}
        args['UserData'] = user_data
        if use_spot:
            instances = u.create_spot_instances(args)
        else:
            try:
                instances = ec2.create_instances(**args)
            except Exception as e:
                # print diagnostics (account/region) before giving up
                print(f"Instance creation failed with ({e})")
                print("Account number: ", u.get_account_number())
                print("Region: ", u.get_region())
                sys.exit()
        assert instances
        assert len(instances) == num_tasks

        # TODO: make instances match their launch indices. This way
        # tasks can figure out which # they are
        for (task_num, instance) in enumerate(instances):
            # retry loop: tagging a brand-new instance sometimes fails with
            # "An error occurred (InvalidInstanceID.NotFound)"
            while True:
                try:
                    # task_name = u.format_task_name(instance.ami_launch_index, role_name,
                    #                                self.name)
                    task_name = u.format_task_name(task_num, job_name)
                    instance.create_tags(Tags=u.make_name(task_name))
                    break
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def make_job(self, role_name, num_tasks=1, **kwargs):
    """Creates (or reuses) a job of num_tasks EC2 instances for role_name.

    Older variant: requires 'ami', 'instance_type' and 'availability_zone'
    to be present in kwargs (after merging with self.kwargs); optional keys
    are placement_group, install_script, skip_efs_mount, linux_type,
    user_data.

    Returns:
      Job object wrapping the found-or-launched instances; also appended to
      self.jobs.
    """
    assert num_tasks >= 0
    # TODO: document launch parameters
    job_name = u.format_job_name(role_name, self.name)
    instances = u.lookup_aws_instances(job_name)
    kwargs = u.merge_kwargs(kwargs, self.kwargs)
    # these three are required; KeyError if missing from merged kwargs
    ami = kwargs['ami']
    instance_type = kwargs['instance_type']
    availability_zone = kwargs['availability_zone']
    placement_group = kwargs.get('placement_group', '')
    install_script = kwargs.get('install_script', '')
    skip_efs_mount = kwargs.get('skip_efs_mount', False)
    linux_type = kwargs.get('linux_type', 'ubuntu')
    user_data = kwargs.get('user_data', '')
    if user_data:
        # sentinel file lets tasks detect that userdata finished running
        user_data += '\necho userdata_ok >> /tmp/is_initialized\n'
    #    print("Using user_data", user_data)

    # TODO: also make sure instance type is the same
    if instances:
        # Reuse path: instances with this job name already exist.
        assert len(instances) == num_tasks, (
            "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
            % (len(instances), num_tasks))
        print("Found existing job " + job_name)
    else:
        # Launch path: request fresh instances in the project VPC.
        print("Launching new job %s into VPC %s" %
              (job_name, u.get_resource_name()))
        security_group = u.get_security_group_dict()[u.get_resource_name()]
        keypair = u.get_keypair_dict()[u.get_keypair_name()]
        vpc = u.get_vpc_dict()[u.get_resource_name()]
        subnet_dict = u.get_subnet_dict(vpc)
        region = u.get_region()
        assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
            availability_zone, region, ', '.join(subnet_dict.keys()))
        subnet = subnet_dict[availability_zone]
        ec2 = u.create_ec2_resource()
        u.maybe_create_placement_group(placement_group)
        self.log("Requesting %d %s" % (num_tasks, instance_type))
        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }
        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]
        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg
        args['UserData'] = user_data

        instances = ec2.create_instances(**args)
        assert len(instances) == num_tasks

        # assign proper names to tasks
        for instance in instances:
            # retry loop: tagging a brand-new instance sometimes fails with
            # "An error occurred (InvalidInstanceID.NotFound)"
            while True:
                try:
                    task_name = u.format_task_name(
                        instance.ami_launch_index, role_name, self.name)
                    # TODO: use instance.create_tags instead like in create_resources.py
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=u.make_name(task_name))
                    break
                except Exception as e:
                    self.log(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

    job = Job(self,
              job_name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type,
              user_data=user_data,
              skip_efs_mount=skip_efs_mount)
    self.jobs.append(job)
    return job
def server_job(name, num_tasks=1, instance_type=None, install_script='',
               placement_group='', ami='', availability_zone='',
               linux_type=DEFAULT_LINUX_TYPE):
    """Creates a job on AWS cluster with publicly facing ports.

    Reuse requires that a job launched previously under the same name has
    identical settings (number of tasks / instance type / placement group).

    Args:
      name: job name; also used as the Name tag on each instance.
      num_tasks: number of instances in the job.
      instance_type: EC2 instance type; defaults to 'c5.large'.
      install_script: passed through to Job.
      placement_group: optional placement group, created if needed.
      ami: AMI id; falls back to the AMI env var, must end up non-empty.
      availability_zone: must map to a subnet of the project VPC.
      linux_type: 'ubuntu' or 'amazon'; selects ROOT_INSTALL_SCRIPT.

    Returns:
      Job object wrapping the found-or-launched instances.

    Side effects: sets globals SSH_KEY_PATH and ROOT_INSTALL_SCRIPT.
    """
    global SSH_KEY_PATH
    DEFAULT_NAME = u.RESOURCE_NAME
    security_group = u.get_security_group_dict()[DEFAULT_NAME]
    keypair = u.get_keypair_dict()[DEFAULT_NAME]

    # get availability zone -> subnet dictionary
    vpc = u.get_vpc_dict()[DEFAULT_NAME]
    subnet_dict = {}
    for subnet in vpc.subnets.all():
        zone = subnet.availability_zone
        assert zone not in subnet_dict, "More than one subnet in %s, why?" % (
            zone, )
        subnet_dict[zone] = subnet
    subnet = subnet_dict[availability_zone]

    global ROOT_INSTALL_SCRIPT
    if linux_type == 'ubuntu':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_UBUNTU
    elif linux_type == 'amazon':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_AMAZON
    else:
        # bugfix: message had a '%s' placeholder but no argument, so the
        # offending linux_type was never shown
        assert False, "Unknown linux type '%s', expected 'ubuntu' or 'amazon'." % (
            linux_type, )

    if instance_type is None:
        instance_type = 'c5.large'
    instances = lookup_aws_instances(name)
    # todo: get rid of this global variable?
    SSH_KEY_PATH = "%s/%s-%s.pem" % (
        os.environ["HOME"],
        DEFAULT_NAME,
        os.environ['AWS_DEFAULT_REGION'],
    )
    if instances:
        # Reuse path: instances with this name already exist.
        assert len(instances) == num_tasks, (
            "Found job with same name, but number"
            " of tasks %d doesn't match requested %d, kill job manually." %
            (len(instances), num_tasks))
        print("Found existing job " + name)
    else:
        print("Launching new job %s into VPC %s" % (name, DEFAULT_NAME))
        ec2 = boto3.resource('ec2')
        if placement_group:
            _maybe_create_placement_group(placement_group)
        print("Requesting %d %s" % (num_tasks, instance_type))
        if not ami:
            ami = os.environ.get('AMI', '')
        assert ami, "No AMI specified, need AMI env-var or explicit parameter"
        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }
        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]
        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group:
            placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg
        instances = ec2.create_instances(**args)

        # todo: use task index in name
        for instance in instances:
            # retry loop: tagging a brand-new instance sometimes fails with
            # "An error occurred (InvalidInstanceID.NotFound)"
            while True:
                try:
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=[{
                                        'Key': 'Name',
                                        'Value': name
                                    }])
                    break
                except Exception as e:
                    # bugfix: was self.log(...), but this is a module-level
                    # function so self does not exist (NameError on failure)
                    print(
                        "create_tags failed with %s, retrying in %d seconds" %
                        (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

        assert len(instances) == num_tasks
        print('{} Instances created'.format(len(instances)))

    job = Job(name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type)
    return job
def main():
    """Launches a single-instance job in the chosen zone, mounts EFS on it,
    and prints the ssh/tmux connect instructions."""
    assert 'AWS_DEFAULT_REGION' in os.environ, "Must specify default region"
    region = os.environ.get("AWS_DEFAULT_REGION")
    # Only validate the zone when one was explicitly given.  bugfix: the
    # unconditional assert made the "pick zone from availability_mapping"
    # branch below unreachable, since ''.startswith(region) is always False.
    if args.zone:
        assert args.zone.startswith(
            region), "Availability zone must be in default region."
    os.system('mkdir -p /tmp/tmux')

    # Pick install script and AMI table for the requested linux flavor.
    # (bugfix: this if/elif block used to appear twice, verbatim)
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        ami = args.ami
    else:
        ami = ami_dict[region]

    if not args.zone:
        # choose a zone known to have capacity for this machine class
        machine_class = args.instance_type[:2]
        zone = availability_mapping[region][machine_class][0]
        print("Chose %s based on availability mapping for %s" %
              (zone, machine_class))
    else:
        zone = args.zone

    print("Launching %s in %s" % (args.name, zone))
    # These lookups double as existence checks -- a KeyError here means the
    # project resources have not been created yet (presumably via the
    # resource-creation script; confirm).
    security_group = u.get_security_group_dict()[u.RESOURCE_NAME]
    keypair = u.get_keypair_dict()[u.RESOURCE_NAME]

    job = aws.server_job(args.name,
                         ami=ami,
                         num_tasks=1,
                         instance_type=args.instance_type,
                         install_script=install_script,
                         availability_zone=zone,
                         linux_type=args.linux_type)
    job.wait_until_ready()
    task = job.tasks[0]

    # this needs DNS to be enabled on VPC
    # alternative way is to provide direct IP from efs_tool.py
    efs_id = u.get_efs_dict()[u.RESOURCE_NAME]
    dns = "{efs_id}.efs.{region}.amazonaws.com".format(**locals())

    # try mounting EFS several times
    for i in range(3):
        try:
            task.run(
                "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs && sudo chmod 777 /efs"
                % (dns, ))
            print("EFS Mount succeeded")
            break
        except Exception as e:
            print("Got error %s, retrying in 10 seconds" % (str(e)))
            time.sleep(10)

    # connect instructions
    print("To connect:")
    print(task.connect_instructions)
def main():
    """Tears down all AWS resources named DEFAULT_NAME in the current region:
    EFS (mount targets first), then the VPC and its subresources (subnets,
    internet gateways, route tables, security groups), then the keypair and
    its local .pem file.  Individual failures are logged and teardown
    continues."""
    # TODO: also bring down all the instances and wait for them to come down
    region = os.environ['AWS_DEFAULT_REGION']
    if DEFAULT_NAME == 'nexus':
        # safety valve: never delete the shared 'nexus' resources
        print("Nexus resources are protected, don't delete them")
        sys.exit()
    print("Deleting %s resources in region %s" % (
        DEFAULT_NAME,
        region,
    ))
    existing_vpcs = u.get_vpc_dict()
    client = u.create_ec2_client()
    ec2 = u.create_ec2_resource()

    def response_type(response):
        # human-readable verdict for an AWS API response
        return 'ok' if u.is_good_response(response) else 'failed'

    # delete EFS
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    efs_client = u.create_efs_client()
    if efs_id:
        try:
            # delete mount targets first (EFS can't be deleted while they exist)
            print("About to delete %s (%s)" % (efs_id, DEFAULT_NAME))
            response = efs_client.describe_mount_targets(FileSystemId=efs_id)
            assert u.is_good_response(response)
            for mount_response in response['MountTargets']:
                subnet = ec2.Subnet(mount_response['SubnetId'])
                zone = subnet.availability_zone
                state = mount_response['LifeCycleState']
                # NOTE(review): 'id' shadows the builtin; zone/state/ip are
                # bound but unused below
                id = mount_response['MountTargetId']
                ip = mount_response['IpAddress']
                sys.stdout.write('Deleting mount target %s ... ' % (id, ))
                sys.stdout.flush()
                response = efs_client.delete_mount_target(MountTargetId=id)
                print(response_type(response))
            sys.stdout.write('Deleting EFS %s (%s)... ' %
                             (efs_id, DEFAULT_NAME))
            sys.stdout.flush()
            u.delete_efs_id(efs_id)
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    if VPC_NAME in existing_vpcs:
        vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
        print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))
        for subnet in vpc.subnets.all():
            try:
                sys.stdout.write("Deleting subnet %s ... " % (subnet.id))
                sys.stdout.write(response_type(subnet.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        for gateway in vpc.internet_gateways.all():
            sys.stdout.write("Deleting gateway %s ... " % (gateway.id))
            # todo: if instances are using VPC, this fails with
            # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway.
            sys.stdout.write('detached ... ' if u.is_good_response(
                gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ')
            sys.stdout.write('deleted ' if u.is_good_response(gateway.delete(
            )) else ' delete_failed ')
            sys.stdout.write('\n')

        def desc(route_table):
            return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))

        for route_table in vpc.route_tables.all():
            sys.stdout.write("Deleting route table %s ... " %
                             (desc(route_table)))
            try:
                sys.stdout.write(response_type(route_table.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        # NOTE(review): intentionally rebinds desc() with a security-group
        # flavored formatter for the loop below
        def desc(security_group):
            return "%s (%s, %s)" % (security_group.id,
                                    u.get_name(security_group.tags),
                                    security_group.group_name)

        # TODO: this tries to remove default security group, maybe not remove it?
        for security_group in vpc.security_groups.all():
            sys.stdout.write('Deleting security group %s ... ' %
                             (desc(security_group)))
            try:
                sys.stdout.write(response_type(security_group.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        sys.stdout.write("Deleting VPC %s ... " % (vpc.id))
        sys.stdout.write(response_type(vpc.delete()) + '\n')

    # delete keypair
    keypairs = u.get_keypair_dict()
    keypair = keypairs.get(DEFAULT_NAME, '')
    if keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (keypair.key_name, DEFAULT_NAME))
            sys.stdout.write(response_type(keypair.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    # NOTE(review): local file uses KEYPAIR_NAME while the remote lookup
    # above uses DEFAULT_NAME -- confirm these refer to the same keypair
    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if os.path.exists(keypair_fn):
        print("Deleting local keypair file %s" % (keypair_fn, ))
        os.system('rm -f ' + keypair_fn)