def __init__(self, instance, job, task_id, install_script=None,
             user_data='', linux_type=None, skip_efs_mount=False):
  self.initialize_called = False
  self.instance = instance
  self.job = job
  self.id = task_id
  if user_data:
    assert user_data.startswith('#!/bin/bash')
  self.install_script = install_script
  self.user_data = user_data
  self.linux_type = linux_type
  self._run_counter = 0
  self.cached_ip = None
  self.cached_public_ip = None
  self.skip_efs_mount = skip_efs_mount
  self.name = u.format_task_name(task_id, job.name)

  # TODO: make below actually mean stuff (also, run_command_available)
  self.initialized = False

  # scratch is client-local space for temporary files
  # TODO: job.name already contains name of run, the directory
  # below uses run name twice
  self.scratch = "{}/{}.{}.{}.{}/scratch".format(
      TASKDIR_PREFIX, job._run.name, job.name, self.id, 0)  # u.now_micros())
  self.remote_scratch = '/tmp/tmux'

  # self.log("Creating local scratch dir %s", self.scratch)
  self._ossystem('rm -Rf ' + self.scratch)  # TODO: don't delete this?
  self._ossystem('mkdir -p ' + self.scratch)
  # os.chdir(self.scratch)

  # todo: create taskdir
  self.connect_instructions = "waiting for initialize()"
  self.keypair_fn = u.get_keypair_fn()

  # username to use to ssh into instances: ec2-user or ubuntu
  if linux_type == 'ubuntu':
    self.username = 'ubuntu'
  elif linux_type == 'amazon':
    self.username = 'ec2-user'
  else:
    assert False, ("Unknown linux type '%s', expected 'ubuntu' or 'amazon'."
                   % linux_type)
  self.taskdir = '/home/' + self.username
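# Connection sketch (illustrative only, not produced by this class): the fields
# set above are the pieces an ssh command needs. Here it is assumed that
# cached_public_ip gets populated later, during initialize():
#
#   ssh -i <task.keypair_fn> -o StrictHostKeyChecking=no \
#       <task.username>@<task.cached_public_ip>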
def make_job(self, role_name, num_tasks=1, skip_existing_job_validation=False,
             **kwargs):
  """skip_existing_job_validation: if True, doesn't check that existing job
  on server has same number of tasks as requested."""

  # u.maybe_create_resources()
  assert num_tasks >= 0

  # TODO: document launch parameters
  job_name = u.format_job_name(role_name, self.name)
  instance_type = kwargs['instance_type']
  instances = u.lookup_aws_instances(job_name, instance_type=instance_type)
  kwargs = u.merge_kwargs(kwargs, self.kwargs)
  ami = kwargs.get('ami', '')
  ami_name = kwargs.get('ami_name', '')
  availability_zone = kwargs.get('availability_zone', '')
  if not availability_zone:
    availability_zone = os.environ['ZONE']
  placement_group = kwargs.get('placement_group', '')
  # automatically generated placement_group_name
  use_placement_group = kwargs.get('use_placement_group', False)
  assert use_placement_group == False or placement_group == ''
  if use_placement_group:
    placement_group = self.placement_group_name
  install_script = kwargs.get('install_script', '')
  skip_efs_mount = kwargs.get('skip_efs_mount', False)
  linux_type = kwargs.get('linux_type', 'ubuntu')
  # TODO: use heuristics to tell linux type from AMI name
  user_data = kwargs.get('user_data', '')
  if user_data:
    assert user_data.startswith('#!/bin/bash')
  ebs = kwargs.get('ebs', '')
  use_spot = kwargs.get('use_spot', False)
  monitoring = kwargs.get('monitoring', True)

  # always install tmux on Amazon linux types
  # TODO: has no effect for some reason
  # https://console.aws.amazon.com/support/v1?region=us-west-2#/case/?displayId=5256445351&language=en
  if linux_type == 'amazon':
    user_data += 'sudo yum install tmux -y'

  if user_data:
    user_data += '\necho userdata_ok >> /tmp/is_initialized\n'
  # print("Using user_data", user_data)

  # TODO: also make sure instance type is the same
  if instances:
    if not skip_existing_job_validation:
      assert len(instances) == num_tasks, (
          "Found job with same name %s (%s), but number of tasks %d doesn't "
          "match requested %d, kill job manually." %
          (job_name, instances[0].state, len(instances), num_tasks))
    print("Found existing job " + job_name)

    starting_instances = False
    for i in instances:
      if i.state['Name'] == 'stopped':
        i.start()
        starting_instances = True

    # TODO: replace with proper wait loop
    if starting_instances:
      while True:
        print("Waiting forever for instances to start")
        time.sleep(10)
    print(instances)
  else:
    print("Launching new job %s into VPC %s" % (job_name,
                                                u.get_resource_name()))

    assert not (ami and ami_name), (
        "Must have only one of ami and ami_name, got " + ami + ", " + ami_name)
    assert ami or ami_name, "Must specify at least one of ami and ami_name"
    if ami_name:
      ami = u.lookup_ami_id(ami_name).id
    security_group = u.get_security_group_dict()[u.get_resource_name()]
    keypair = u.get_keypair_dict()[u.get_keypair_name()]
    vpc = u.get_vpc_dict()[u.get_resource_name()]
    subnet_dict = u.get_subnet_dict(vpc)
    region = u.get_region()
    assert availability_zone in subnet_dict, (
        "Availability zone %s is not in subnet dict for current AWS default "
        "region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION=%s)"
        % (availability_zone, region, ', '.join(subnet_dict.keys()),
           availability_zone[:-1]))
    subnet = subnet_dict[availability_zone]
    ec2 = u.create_ec2_resource()
    u.maybe_create_placement_group(placement_group)

    self.log("Requesting %d %s" % (num_tasks, instance_type))

    args = {'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name}

    # storage setup
    if ebs:
      args['BlockDeviceMappings'] = ebs

    # network setup
    # TODO: get rid of zone? Zone seems to be required for the constructor
    # that allows enabling the AssociatePublicIpAddress field
    args['NetworkInterfaces'] = [{'SubnetId': subnet.id,
                                  'DeviceIndex': 0,
                                  'AssociatePublicIpAddress': True,
                                  'Groups': [security_group.id]}]
    placement_arg = {'AvailabilityZone': availability_zone}
    if placement_group:
      placement_arg['GroupName'] = placement_group
    args['Placement'] = placement_arg

    if monitoring:
      args['Monitoring'] = {'Enabled': True}
    args['UserData'] = user_data

    if use_spot:
      instances = u.create_spot_instances(args)
    else:
      try:
        instances = ec2.create_instances(**args)
      except Exception as e:
        print(f"Instance creation failed with ({e})")
        print("Account number: ", u.get_account_number())
        print("Region: ", u.get_region())
        sys.exit()

    assert instances
    assert len(instances) == num_tasks

    # TODO: make instances match their launch indices. This way
    # tasks can figure out which # they are
    for (task_num, instance) in enumerate(instances):
      while True:
        try:
          # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
          # task_name = u.format_task_name(instance.ami_launch_index, role_name,
          #                                self.name)
          task_name = u.format_task_name(task_num, job_name)
          instance.create_tags(Tags=u.make_name(task_name))
          break
        except Exception as e:
          self.log("create_tags failed with %s, retrying in %d seconds" %
                   (str(e), TIMEOUT_SEC))
          time.sleep(TIMEOUT_SEC)

  job = Job(self, job_name, instances=instances,
            install_script=install_script,
            linux_type=linux_type,
            user_data=user_data,
            skip_efs_mount=skip_efs_mount)
  self.jobs.append(job)
  return job
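# Usage sketch (illustrative only; `run` is assumed to be an object exposing
# make_job above, and the instance type / AMI name / zone values are
# placeholders, not values from this repo):
#
#   job = run.make_job('worker', num_tasks=2,
#                      instance_type='p3.2xlarge',
#                      ami_name='my-training-ami',      # or ami='ami-...'
#                      availability_zone='us-east-1a',  # else taken from $ZONE
#                      use_spot=False)
#   # `job` wraps the launched (or reused) EC2 instances, one task per instance.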
def make_job(self, role_name, num_tasks=1, **kwargs):
  assert num_tasks >= 0

  # TODO: document launch parameters
  job_name = u.format_job_name(role_name, self.name)
  instances = u.lookup_aws_instances(job_name)
  kwargs = u.merge_kwargs(kwargs, self.kwargs)
  ami = kwargs['ami']
  instance_type = kwargs['instance_type']
  availability_zone = kwargs['availability_zone']
  placement_group = kwargs.get('placement_group', '')
  install_script = kwargs.get('install_script', '')
  skip_efs_mount = kwargs.get('skip_efs_mount', False)
  linux_type = kwargs.get('linux_type', 'ubuntu')
  user_data = kwargs.get('user_data', '')
  if user_data:
    user_data += '\necho userdata_ok >> /tmp/is_initialized\n'
  # print("Using user_data", user_data)

  # TODO: also make sure instance type is the same
  if instances:
    assert len(instances) == num_tasks, (
        "Found job with same name, but number of tasks %d doesn't match "
        "requested %d, kill job manually." % (len(instances), num_tasks))
    print("Found existing job " + job_name)
  else:
    print("Launching new job %s into VPC %s" % (job_name,
                                                u.get_resource_name()))

    security_group = u.get_security_group_dict()[u.get_resource_name()]
    keypair = u.get_keypair_dict()[u.get_keypair_name()]
    vpc = u.get_vpc_dict()[u.get_resource_name()]
    subnet_dict = u.get_subnet_dict(vpc)
    region = u.get_region()
    assert availability_zone in subnet_dict, (
        "Availability zone %s is not in subnet dict for current AWS default "
        "region %s, available subnets are %s. (hint, set AWS_DEFAULT_REGION)"
        % (availability_zone, region, ', '.join(subnet_dict.keys())))
    subnet = subnet_dict[availability_zone]
    ec2 = u.create_ec2_resource()
    u.maybe_create_placement_group(placement_group)

    self.log("Requesting %d %s" % (num_tasks, instance_type))

    args = {'ImageId': ami,
            'InstanceType': instance_type,
            'MinCount': num_tasks,
            'MaxCount': num_tasks,
            'KeyName': keypair.name}

    # network setup
    args['NetworkInterfaces'] = [{'SubnetId': subnet.id,
                                  'DeviceIndex': 0,
                                  'AssociatePublicIpAddress': True,
                                  'Groups': [security_group.id]}]
    placement_arg = {'AvailabilityZone': availability_zone}
    if placement_group:
      placement_arg['GroupName'] = placement_group
    args['Placement'] = placement_arg
    args['UserData'] = user_data

    instances = ec2.create_instances(**args)
    assert len(instances) == num_tasks

    # assign proper names to tasks
    for instance in instances:
      while True:
        try:
          # sometimes get "An error occurred (InvalidInstanceID.NotFound)"
          task_name = u.format_task_name(instance.ami_launch_index, role_name,
                                         self.name)
          # TODO: use instance.create_tags instead like in create_resources.py
          ec2.create_tags(Resources=[instance.id], Tags=u.make_name(task_name))
          break
        except Exception as e:
          self.log("create_tags failed with %s, retrying in %d seconds" %
                   (str(e), TIMEOUT_SEC))
          time.sleep(TIMEOUT_SEC)

  job = Job(self, job_name, instances=instances,
            install_script=install_script,
            linux_type=linux_type,
            user_data=user_data,
            skip_efs_mount=skip_efs_mount)
  self.jobs.append(job)
  return job