Example #1
0
def keypair_setup():
  """Creates keypair if necessary, saves private key locally, returns the
  boto3 keypair object.

  Reuses an existing keypair (and its local .pem file) when present;
  otherwise creates the keypair on AWS and writes the private key to disk
  with owner-read-only permissions.
  """

  existing_keypairs = u.get_keypair_dict()
  keypair = existing_keypairs.get(KEYPAIR_NAME, None)
  keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
  if keypair:
    print("Reusing keypair "+KEYPAIR_NAME)
    # check that local pem file exists and is readable
    assert os.path.exists(keypair_fn)
    # close the handle deterministically (was: bare open().read())
    with open(keypair_fn) as f:
      keypair_contents = f.read()
    assert len(keypair_contents)>0
    # todo: check that fingerprint matches keypair.key_fingerprint
    return keypair

  print("Creating keypair "+KEYPAIR_NAME)
  ec2 = u.create_ec2_resource()
  # bug fix: check for a stale local .pem BEFORE creating the remote keypair;
  # previously the assert fired after create_key_pair, orphaning the remote
  # keypair (also fixed message typo "previous, keypair")
  assert not os.path.exists(keypair_fn), "previous keypair exists, delete it with 'sudo rm %s'"%(keypair_fn)
  keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)

  with open(keypair_fn, 'w') as f:
    f.write(keypair.key_material)
  # private key must be owner-readable only (was: os.system('chmod 400 ...'))
  os.chmod(keypair_fn, 0o400)
  return keypair
Example #2
0
def keypair_setup():
    """Creates keypair if necessary, saves private key locally, returns the
    boto3 keypair object.

    Reuses an existing keypair (and its local .pem file) when present;
    otherwise creates the keypair on AWS and writes the private key to disk
    with owner-read-only permissions.
    """

    # ensure the directory for the .pem file exists (was: os.system mkdir -p)
    os.makedirs(u.PRIVATE_KEY_LOCATION, exist_ok=True)

    keypair = u.get_keypair_dict().get(KEYPAIR_NAME, None)
    keypair_fn = u.get_keypair_fn()
    if keypair:
        print("Reusing keypair " + KEYPAIR_NAME)
        # check that local pem file exists and is readable
        assert os.path.exists(
            keypair_fn
        ), "Keypair %s exists, but corresponding .pem file %s is not found, delete keypair %s through console and run again to recreate keypair/.pem together" % (
            KEYPAIR_NAME, keypair_fn, KEYPAIR_NAME)
        # close the handle deterministically (was: bare open().read())
        with open(keypair_fn) as f:
            keypair_contents = f.read()
        assert len(keypair_contents) > 0
        # todo: check that fingerprint matches keypair.key_fingerprint
    else:
        print("Creating keypair " + KEYPAIR_NAME)
        ec2 = u.create_ec2_resource()
        # checked before create_key_pair so a stale .pem doesn't orphan a
        # freshly created remote keypair
        assert not os.path.exists(
            keypair_fn
        ), "previous keypair exists, delete it with 'sudo rm %s' and also delete corresponding keypair through console" % (
            keypair_fn)
        keypair = ec2.create_key_pair(KeyName=KEYPAIR_NAME)

        with open(keypair_fn, 'w') as f:
            f.write(keypair.key_material)
        # private key must be owner-readable only (was: os.system chmod 400)
        os.chmod(keypair_fn, 0o400)

    return keypair
Example #3
0
def delete_keypair():
    """Deletes the KEYPAIR_NAME keypair on AWS (if present) and the local
    .pem file.

    Best effort: failures are logged via u.loge rather than raised, so
    cleanup continues.
    """
    keypairs = u.get_keypair_dict()
    keypair = keypairs.get(KEYPAIR_NAME, '')
    if keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (keypair.key_name, KEYPAIR_NAME))
            # NOTE(review): response_type is assumed to be a module-level
            # helper ('ok'/'failed' from an AWS response) -- confirm it exists.
            sys.stdout.write(response_type(keypair.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    keypair_fn = u.get_keypair_fn()
    if os.path.exists(keypair_fn):
        print("Deleting local keypair file %s" % (keypair_fn, ))
        try:
            os.remove(keypair_fn)  # was: os.system('rm -f ...')
        except OSError as e:
            # keep the old rm -f best-effort semantics: log, don't raise
            u.loge(str(e) + '\n')
Example #4
0
    def make_job(self,
                 role_name,
                 num_tasks=1,
                 skip_existing_job_validation=False,
                 **kwargs):
        """Creates a job on AWS, or reuses a running job with the same name.

        Args:
          role_name: role of this job; combined with self.name into the job
            name (u.format_job_name).
          num_tasks: number of EC2 instances the job should have.
          skip_existing_job_validation: if True, doesn't check that existing
            job on server has same number of tasks as requested.
          **kwargs: launch parameters, merged with self.kwargs
            (u.merge_kwargs). Required: 'instance_type'. Optional: 'ami' or
            'ami_name' (exactly one), 'availability_zone' (falls back to ZONE
            env var), 'placement_group' or 'use_placement_group',
            'install_script', 'skip_efs_mount', 'linux_type', 'user_data',
            'ebs', 'use_spot', 'monitoring'.

        Returns:
          Job object wrapping the launched/reused instances; also appended
          to self.jobs.
        """

        #    u.maybe_create_resources()

        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        instance_type = kwargs['instance_type']
        # existing instances under this job name mean the job is reused
        instances = u.lookup_aws_instances(job_name,
                                           instance_type=instance_type)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs.get('ami', '')
        ami_name = kwargs.get('ami_name', '')
        availability_zone = kwargs.get('availability_zone', '')
        if not availability_zone:
            # falls back to ZONE env var; raises KeyError if unset
            availability_zone = os.environ['ZONE']
        placement_group = kwargs.get('placement_group', '')

        # automatically generated placement_group_name
        use_placement_group = kwargs.get('use_placement_group', False)
        # explicit placement_group and use_placement_group are mutually
        # exclusive
        assert use_placement_group == False or placement_group == ''
        if use_placement_group:
            placement_group = self.placement_group_name

        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        # TODO: use heuristics to tell linux type from AMI name
        user_data = kwargs.get('user_data', '')

        if user_data:
            # EC2 user_data must be a bash script here
            assert user_data.startswith('#!/bin/bash')

        ebs = kwargs.get('ebs', '')
        use_spot = kwargs.get('use_spot', False)
        monitoring = kwargs.get('monitoring', True)

        # always install tmux on Amazon linux types
        # TODO: has no effect for some reason
        # https://console.aws.amazon.com/support/v1?region=us-west-2#/case/?displayId=5256445351&language=en
        if linux_type == 'amazon':
            user_data += 'sudo yum install tmux -y'

        if user_data:
            # sentinel a task can poll to detect that user_data completed
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        #    print("Using user_data", user_data)

        # TODO: also make sure instance type is the same
        if instances:
            # reuse path: validate count, restart any stopped instances
            if not skip_existing_job_validation:
                assert len(instances) == num_tasks, (
                    "Found job with same name %s(%s), but number of tasks %d doesn't match requested %d, kill job manually."
                    %
                    (job_name, instances[0].state, len(instances), num_tasks))

            print("Found existing job " + job_name)
            starting_instances = False
            for i in instances:
                if i.state['Name'] == 'stopped':
                    i.start()
                    starting_instances = True

            # TODO: replace with proper wait loop
            # NOTE(review): this loop never exits -- it prints and sleeps
            # until the user interrupts; confirm whether a real wait-until-
            # running loop was intended here.
            if starting_instances:
                while True:
                    print("Waiting forever for instances to start")
                    time.sleep(10)

            print(instances)
        else:
            # launch path: create num_tasks fresh instances
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            assert not (
                ami and ami_name
            ), "Must have only one of ami and ami_name, got " + ami + ", " + ami_name
            assert ami or ami_name, "Must specify at least one of ami and ami_name"
            if ami_name:
                ami = u.lookup_ami_id(ami_name).id
            security_group = u.get_security_group_dict()[u.get_resource_name()]

            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            # availability zone -> subnet mapping for this VPC
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)" % (
                availability_zone, region, ', '.join(
                    subnet_dict.keys()), availability_zone[:-1])
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            # arguments for ec2.create_instances / spot request
            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # storage setup
            if ebs: args['BlockDeviceMappings'] = ebs
            # network setup
            # TODO: get rid of zone? Zone seems to be required for constructor
            # that allows to enable AssociatePublicIpAddress field
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}
            if placement_group: placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg

            if monitoring: args['Monitoring'] = {'Enabled': True}
            args['UserData'] = user_data

            if use_spot: instances = u.create_spot_instances(args)
            else:
                try:
                    instances = ec2.create_instances(**args)
                except Exception as e:
                    # surface account/region context before bailing out
                    print(f"Instance creation failed with ({e})")
                    print("Account number: ", u.get_account_number())
                    print("Region: ", u.get_region())
                    sys.exit()

            assert instances
            assert len(instances) == num_tasks

            # TODO: make instances match their launch indices. This way
            # tasks can figure out which # they are
            for (task_num, instance) in enumerate(instances):
                # retry loop: tagging can race instance visibility on AWS side
                while True:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        # task_name = u.format_task_name(instance.ami_launch_index, role_name,
                        #                                self.name)
                        task_name = u.format_task_name(task_num, job_name)
                        instance.create_tags(Tags=u.make_name(task_name))

                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job
Example #5
0
    def make_job(self, role_name, num_tasks=1, **kwargs):
        """Creates a job on AWS, or reuses an existing job with the same name.

        Args:
          role_name: role of this job; combined with self.name into the job
            name (u.format_job_name).
          num_tasks: number of EC2 instances the job should have.
          **kwargs: launch parameters, merged with self.kwargs
            (u.merge_kwargs). Required keys: 'ami', 'instance_type',
            'availability_zone'. Optional: 'placement_group',
            'install_script', 'skip_efs_mount', 'linux_type', 'user_data'.

        Returns:
          Job object wrapping the instances; also appended to self.jobs.
        """
        assert num_tasks >= 0

        # TODO: document launch parameters
        job_name = u.format_job_name(role_name, self.name)
        # existing instances under this job name mean the job is reused
        instances = u.lookup_aws_instances(job_name)
        kwargs = u.merge_kwargs(kwargs, self.kwargs)
        ami = kwargs['ami']
        instance_type = kwargs['instance_type']
        availability_zone = kwargs['availability_zone']
        placement_group = kwargs.get('placement_group', '')
        install_script = kwargs.get('install_script', '')
        skip_efs_mount = kwargs.get('skip_efs_mount', False)
        linux_type = kwargs.get('linux_type', 'ubuntu')
        user_data = kwargs.get('user_data', '')

        if user_data:
            # sentinel a task can poll to detect that user_data completed
            user_data += '\necho userdata_ok >> /tmp/is_initialized\n'

        #    print("Using user_data", user_data)

        # TODO: also make sure instance type is the same
        if instances:
            assert len(instances) == num_tasks, (
                "Found job with same name, but number of tasks %d doesn't match requested %d, kill job manually."
                % (len(instances), num_tasks))
            print("Found existing job " + job_name)
        else:
            print("Launching new job %s into VPC %s" %
                  (job_name, u.get_resource_name()))

            security_group = u.get_security_group_dict()[u.get_resource_name()]
            keypair = u.get_keypair_dict()[u.get_keypair_name()]
            vpc = u.get_vpc_dict()[u.get_resource_name()]
            # availability zone -> subnet mapping for this VPC
            subnet_dict = u.get_subnet_dict(vpc)
            region = u.get_region()
            assert availability_zone in subnet_dict, "Availability zone %s is not in subnet dict for current AWS default region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)" % (
                availability_zone, region, ', '.join(subnet_dict.keys()))
            subnet = subnet_dict[availability_zone]
            ec2 = u.create_ec2_resource()
            u.maybe_create_placement_group(placement_group)

            self.log("Requesting %d %s" % (num_tasks, instance_type))

            # arguments for ec2.create_instances
            args = {
                'ImageId': ami,
                'InstanceType': instance_type,
                'MinCount': num_tasks,
                'MaxCount': num_tasks,
                'KeyName': keypair.name
            }

            # network setup
            args['NetworkInterfaces'] = [{
                'SubnetId': subnet.id,
                'DeviceIndex': 0,
                'AssociatePublicIpAddress': True,
                'Groups': [security_group.id]
            }]

            placement_arg = {'AvailabilityZone': availability_zone}

            if placement_group: placement_arg['GroupName'] = placement_group
            args['Placement'] = placement_arg
            args['UserData'] = user_data

            instances = ec2.create_instances(**args)
            assert len(instances) == num_tasks

            # assign proper names to tasks
            for instance in instances:
                # retry loop: tagging can race instance visibility on AWS side
                while True:
                    try:
                        # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                        task_name = u.format_task_name(
                            instance.ami_launch_index, role_name, self.name)
                        # TODO: use instance.create_tags instead like in create_resources.py
                        ec2.create_tags(Resources=[instance.id],
                                        Tags=u.make_name(task_name))
                        break
                    except Exception as e:
                        self.log(
                            "create_tags failed with %s, retrying in %d seconds"
                            % (str(e), TIMEOUT_SEC))
                        time.sleep(TIMEOUT_SEC)

        job = Job(self,
                  job_name,
                  instances=instances,
                  install_script=install_script,
                  linux_type=linux_type,
                  user_data=user_data,
                  skip_efs_mount=skip_efs_mount)
        self.jobs.append(job)
        return job
Example #6
0
def server_job(name,
               num_tasks=1,
               instance_type=None,
               install_script='',
               placement_group='',
               ami='',
               availability_zone='',
               linux_type=DEFAULT_LINUX_TYPE):
    """Creates a job on AWS cluster with publicly facing ports.

    Reuse requires that a job launched previously under the same name has
    identical settings (number of tasks / instance type / placement group).

    Args:
      name: job name; also used as the Name tag for launched instances.
      num_tasks: number of EC2 instances in the job.
      instance_type: EC2 instance type; defaults to 'c5.large'.
      install_script: passed through to the Job object.
      placement_group: optional placement group name.
      ami: AMI id; falls back to the AMI environment variable.
      availability_zone: zone to launch into; must have a subnet in the VPC.
      linux_type: 'ubuntu' or 'amazon'; selects ROOT_INSTALL_SCRIPT.

    Returns:
      Job object wrapping the launched (or pre-existing) instances.
    """

    global SSH_KEY_PATH

    DEFAULT_NAME = u.RESOURCE_NAME
    security_group = u.get_security_group_dict()[DEFAULT_NAME]
    keypair = u.get_keypair_dict()[DEFAULT_NAME]
    # get availability zone -> subnet dictionary
    vpc = u.get_vpc_dict()[DEFAULT_NAME]
    subnet_dict = {}
    for subnet in vpc.subnets.all():
        zone = subnet.availability_zone
        assert zone not in subnet_dict, "More than one subnet in %s, why?" % (
            zone, )
        subnet_dict[zone] = subnet
    subnet = subnet_dict[availability_zone]

    global ROOT_INSTALL_SCRIPT
    if linux_type == 'ubuntu':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_UBUNTU
    elif linux_type == 'amazon':
        ROOT_INSTALL_SCRIPT = ROOT_INSTALL_SCRIPT_AMAZON
    else:
        # bug fix: message previously contained an unfilled '%s' placeholder
        assert False, "Unknown linux type '%s', expected 'ubuntu' or 'amazon'." % (
            linux_type, )

    if instance_type is None:
        instance_type = 'c5.large'
    instances = lookup_aws_instances(name)

    # todo: get rid of this global variable?
    SSH_KEY_PATH = "%s/%s-%s.pem" % (
        os.environ["HOME"],
        DEFAULT_NAME,
        os.environ['AWS_DEFAULT_REGION'],
    )

    if instances:
        assert len(instances) == num_tasks, (
            "Found job with same name, but number"
            " of tasks %d doesn't match requested %d, kill job manually." %
            (len(instances), num_tasks))
        print("Found existing job " + name)
    else:
        print("Launching new job %s into VPC %s" % (name, DEFAULT_NAME))

        ec2 = boto3.resource('ec2')
        if placement_group:
            _maybe_create_placement_group(placement_group)

        print("Requesting %d %s" % (num_tasks, instance_type))

        if not ami:
            ami = os.environ.get('AMI', '')

        assert ami, "No AMI specified, need AMI env-var or explicit parameter"

        # arguments for ec2.create_instances
        args = {
            'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name
        }

        # network setup
        args['NetworkInterfaces'] = [{
            'SubnetId': subnet.id,
            'DeviceIndex': 0,
            'AssociatePublicIpAddress': True,
            'Groups': [security_group.id]
        }]

        placement_arg = {'AvailabilityZone': availability_zone}
        if placement_group: placement_arg['GroupName'] = placement_group
        args['Placement'] = placement_arg

        instances = ec2.create_instances(**args)

        # todo: use task index in name
        for instance in instances:
            # retry loop: tagging can race instance visibility on AWS side
            while True:
                try:
                    # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
                    ec2.create_tags(Resources=[instance.id],
                                    Tags=[{
                                        'Key': 'Name',
                                        'Value': name
                                    }])
                    break
                except Exception as e:
                    # bug fix: this is a module-level function with no `self`;
                    # the original `self.log(...)` raised NameError on retry
                    print("create_tags failed with %s, retrying in %d seconds"
                          % (str(e), TIMEOUT_SEC))
                    time.sleep(TIMEOUT_SEC)

        assert len(instances) == num_tasks
        print('{} Instances created'.format(len(instances)))

    job = Job(name,
              instances=instances,
              install_script=install_script,
              linux_type=linux_type)
    return job
Example #7
0
def main():
    """Launches a single-instance job, mounts EFS on it, and prints connect
    instructions. Relies on module-level `args` (argparse result) and the
    AWS_DEFAULT_REGION environment variable."""
    assert 'AWS_DEFAULT_REGION' in os.environ, "Must specify default region"
    region = os.environ.get("AWS_DEFAULT_REGION")
    # bug fix: only validate the zone when one was given; previously an empty
    # --zone failed this assert, making the availability-mapping fallback
    # below unreachable
    if args.zone:
        assert args.zone.startswith(
            region), "Availability zone must be in default region."
    os.system('mkdir -p /tmp/tmux')

    # select install script + AMI table for the requested linux flavor
    # (an identical duplicate of this if/elif chain was removed)
    if args.linux_type == 'ubuntu':
        install_script = INSTALL_SCRIPT_UBUNTU
        ami_dict = ami_dict_ubuntu
    elif args.linux_type == 'amazon':
        install_script = INSTALL_SCRIPT_AMAZON
        ami_dict = ami_dict_amazon
    else:
        assert False, "Unknown linux type " + args.linux_type

    if args.ami:
        ami = args.ami
    else:
        ami = ami_dict[region]

    if not args.zone:
        # pick a zone known to offer this machine class in this region
        machine_class = args.instance_type[:2]
        zone = availability_mapping[region][machine_class][0]
        print("Chose %s based on availability mapping for %s" %
              (zone, machine_class))
    else:
        zone = args.zone

    print("Launching %s in %s" % (args.name, zone))
    # values unused, but the lookups fail fast (KeyError) if the security
    # group / keypair for this resource name don't exist yet
    u.get_security_group_dict()[u.RESOURCE_NAME]
    u.get_keypair_dict()[u.RESOURCE_NAME]

    job = aws.server_job(args.name,
                         ami=ami,
                         num_tasks=1,
                         instance_type=args.instance_type,
                         install_script=install_script,
                         availability_zone=zone,
                         linux_type=args.linux_type)

    job.wait_until_ready()
    task = job.tasks[0]

    # this needs DNS to be enabled on VPC
    # alternative way is to provide direct IP from efs_tool.py
    efs_id = u.get_efs_dict()[u.RESOURCE_NAME]
    dns = "{efs_id}.efs.{region}.amazonaws.com".format(**locals())

    # try mounting EFS several times
    for i in range(3):
        try:
            task.run(
                "sudo mount -t nfs -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 %s:/ /efs && sudo chmod 777 /efs"
                % (dns, ))
            print("EFS Mount succeeded")
            break
        except Exception as e:
            print("Got error %s, retrying in 10 seconds" % (str(e)))
            time.sleep(10)

    # connect instructions
    print("To connect:")
    print(task.connect_instructions)
Example #8
0
def main():
    """Deletes AWS resources (EFS, VPC and its subresources, keypair) that
    belong to DEFAULT_NAME in the current default region.

    Each deletion is best-effort: failures are logged (u.loge) and the
    remaining resources are still attempted.
    """
    # TODO: also bring down all the instances and wait for them to come down
    region = os.environ['AWS_DEFAULT_REGION']
    if DEFAULT_NAME == 'nexus':
        print("Nexus resources are protected, don't delete them")
        sys.exit()

    print("Deleting %s resources in region %s" % (DEFAULT_NAME, region))
    existing_vpcs = u.get_vpc_dict()
    ec2 = u.create_ec2_resource()

    def response_type(response):
        # human-readable status of an AWS API response
        return 'ok' if u.is_good_response(response) else 'failed'

    # delete EFS: mount targets must be removed before the filesystem itself
    efss = u.get_efs_dict()
    efs_id = efss.get(DEFAULT_NAME, '')
    efs_client = u.create_efs_client()
    if efs_id:
        try:
            print("About to delete %s (%s)" % (efs_id, DEFAULT_NAME))
            response = efs_client.describe_mount_targets(FileSystemId=efs_id)
            assert u.is_good_response(response)
            for mount_response in response['MountTargets']:
                # renamed from `id` to avoid shadowing the builtin; unused
                # subnet/zone/state/ip locals removed
                mount_target_id = mount_response['MountTargetId']
                sys.stdout.write('Deleting mount target %s ... ' %
                                 (mount_target_id, ))
                sys.stdout.flush()
                response = efs_client.delete_mount_target(
                    MountTargetId=mount_target_id)
                print(response_type(response))

            sys.stdout.write('Deleting EFS %s (%s)... ' %
                             (efs_id, DEFAULT_NAME))
            sys.stdout.flush()
            u.delete_efs_id(efs_id)

        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    if VPC_NAME in existing_vpcs:
        vpc = ec2.Vpc(existing_vpcs[VPC_NAME].id)
        print("Deleting VPC %s (%s) subresources:" % (VPC_NAME, vpc.id))

        for subnet in vpc.subnets.all():
            try:
                sys.stdout.write("Deleting subnet %s ... " % (subnet.id))
                sys.stdout.write(response_type(subnet.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        for gateway in vpc.internet_gateways.all():
            sys.stdout.write("Deleting gateway %s ... " % (gateway.id))
            # todo: if instances are using VPC, this fails with
            # botocore.exceptions.ClientError: An error occurred (DependencyViolation) when calling the DetachInternetGateway operation: Network vpc-ca4abab3 has some mapped public address(es). Please unmap those public address(es) before detaching the gateway.

            sys.stdout.write('detached ... ' if u.is_good_response(
                gateway.detach_from_vpc(VpcId=vpc.id)) else ' detach_failed ')
            sys.stdout.write('deleted ' if u.is_good_response(gateway.delete(
            )) else ' delete_failed ')
            sys.stdout.write('\n')

        def desc_route_table(route_table):
            # short human-readable label; renamed so it no longer collides
            # with the security-group desc helper below
            return "%s (%s)" % (route_table.id, u.get_name(route_table.tags))

        for route_table in vpc.route_tables.all():
            sys.stdout.write("Deleting route table %s ... " %
                             (desc_route_table(route_table)))
            try:
                sys.stdout.write(response_type(route_table.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        def desc_security_group(security_group):
            # short human-readable label for a security group
            return "%s (%s, %s)" % (security_group.id,
                                    u.get_name(security_group.tags),
                                    security_group.group_name)

        # TODO: this tries to remove default security group, maybe not remove it?
        for security_group in vpc.security_groups.all():
            sys.stdout.write('Deleting security group %s ... ' %
                             (desc_security_group(security_group)))
            try:
                sys.stdout.write(response_type(security_group.delete()) + '\n')
            except Exception as e:
                sys.stdout.write('failed\n')
                u.loge(str(e) + '\n')

        sys.stdout.write("Deleting VPC %s ... " % (vpc.id))
        sys.stdout.write(response_type(vpc.delete()) + '\n')

    # delete keypair
    keypairs = u.get_keypair_dict()
    keypair = keypairs.get(DEFAULT_NAME, '')
    if keypair:
        try:
            sys.stdout.write("Deleting keypair %s (%s) ... " %
                             (keypair.key_name, DEFAULT_NAME))
            sys.stdout.write(response_type(keypair.delete()) + '\n')
        except Exception as e:
            sys.stdout.write('failed\n')
            u.loge(str(e) + '\n')

    # NOTE(review): the keypair above is looked up by DEFAULT_NAME but the
    # local pem file by KEYPAIR_NAME -- confirm the two names agree
    keypair_fn = u.get_keypair_fn(KEYPAIR_NAME)
    if os.path.exists(keypair_fn):
        print("Deleting local keypair file %s" % (keypair_fn, ))
        try:
            os.remove(keypair_fn)  # was: os.system('rm -f ...')
        except OSError as e:
            # keep the old rm -f best-effort semantics: log, don't raise
            u.loge(str(e) + '\n')