def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False):
    """Launch ``main_cmd`` inside Docker on a new GCP instance.

    Uploads read-only local mounts to GCS (content-addressed by file
    hash), registers GCP output mounts for periodic syncing, builds the
    docker invocation, and creates the instance with all launch
    information passed through instance metadata.

    Args:
        main_cmd: Command string to run inside the Docker container.
        mount_points: Iterable of mount objects (``MountLocal`` or
            ``MountGCP``). ``MountLocal`` must be read-only;
            ``MountGCP`` must be an output mount.
        dry: Unused here; kept for interface parity with the EC2 variant.
        verbose: If True, print the generated instance name and metadata.

    Raises:
        ValueError: For a writable ``MountLocal``.
        NotImplementedError: For a non-output ``MountGCP`` or an unknown
            mount type.
    """
    # Experiment naming: explicit log name wins, otherwise prefix + timekey.
    # NOTE(review): borrows EC2SpotDocker.make_timekey via an unbound call —
    # presumably this class has no timekey helper of its own; verify.
    if self.gcp_log_name is None:
        exp_name = "{}-{}".format(self.gcp_log_prefix,
                                  EC2SpotDocker.make_timekey(self))
    else:
        exp_name = self.gcp_log_name
    exp_prefix = self.gcp_log_prefix
    # GCS log layout: <log_path>/<prefix-with-dashes>/<exp_name>
    gcp_base_dir = os.path.join(self.gcp_log_path,
                                exp_prefix.replace("_", "-"), exp_name)

    mnt_args = ''          # accumulated `-v host:container` docker args
    py_path = []           # container paths to prepend to PYTHONPATH
    gcp_mount_info = []    # (local_dir, gcs_path, include, interval) tuples
    local_mounts = []      # file hashes of uploaded read-only code mounts
    for mount in mount_points:
        print('Handling mount: ', mount)
        if isinstance(mount, MountLocal):
            # TODO: these should be mount_s3 objects
            if mount.read_only:
                if mount.path_on_remote is None:
                    # First launch: tar+gzip the local dir and upload it,
                    # keyed by content hash so identical code is reused.
                    with mount.gzip() as gzip_file:
                        gzip_path = os.path.realpath(gzip_file)
                        file_hash = utils.hash_file(gzip_path)
                        gcp_path = upload_file_to_gcp_storage(
                            bucket_name=self.gcp_bucket_name,
                            file_name=gzip_path,
                            remote_filename=file_hash + '.tar'
                        )
                    # Cache upload result on the mount for reuse.
                    mount.path_on_remote = gcp_path
                    mount.local_file_hash = file_hash
                else:
                    # Already uploaded on a previous launch.
                    file_hash = mount.local_file_hash
                    gcp_path = mount.path_on_remote
                remote_unpack_name = '/tmp/' + file_hash
                # Mount under /mounts inside the container, stripping '~/'.
                mount_point = os.path.join(
                    '/mounts', mount.mount_point.replace('~/', ''))
                mnt_args += ' -v %s:%s' % (
                    os.path.join(remote_unpack_name,
                                 os.path.basename(mount.local_dir)),
                    mount_point)
                if mount.pythonpath:
                    py_path.append(mount_point)
                local_mounts.append(file_hash)
            else:
                # Writable local mounts are not supported on GCP.
                raise ValueError()
        elif isinstance(mount, MountGCP):
            gcp_local_dir = mount.mount_point
            gcp_path = os.path.join(gcp_base_dir, mount.gcp_path)
            if not mount.output:
                # Only output (synced-to-GCS) mounts are supported.
                raise NotImplementedError()
            gcp_mount_info.append(
                (gcp_local_dir, gcp_path, mount.include_string,
                 mount.sync_interval)
            )
            mnt_args += ' -v %s:%s' % (gcp_local_dir, mount.mount_point)
        else:
            raise NotImplementedError()

    docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False,
                                     extra_args=mnt_args,
                                     pythonpath=py_path,
                                     use_docker_generated_name=True)

    # Read the startup/shutdown scripts with context managers so the
    # file handles are closed deterministically (the previous
    # open(...).read() leaked them).
    with open(GCP_STARTUP_SCRIPT_PATH, "r") as f:
        startup_script = f.read()
    with open(GCP_SHUTDOWN_SCRIPT_PATH, "r") as f:
        shutdown_script = f.read()

    # Everything the remote startup script needs is smuggled through
    # GCE instance metadata; complex values are JSON-encoded.
    metadata = {
        'bucket_name': self.gcp_bucket_name,
        'docker_cmd': docker_cmd,
        'docker_image': self.docker_image,
        'local_mounts': json.dumps(local_mounts),
        'gcp_mounts': json.dumps(gcp_mount_info),
        'use_gpu': json.dumps(self.gpu),
        'num_exps': self.num_exps,
        'terminate': json.dumps(self.terminate),
        'startup-script': startup_script,
        'shutdown-script': shutdown_script,
    }
    # Instance name must match the GCE name regex
    # '(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)', so strip dashes from the UUID.
    unique_name = "doodad" + str(uuid.uuid4()).replace("-", "")
    self.create_instance(metadata, unique_name, exp_name, exp_prefix)
    if verbose:
        print(unique_name)
        print(metadata)
def launch_command(self, main_cmd, mount_points=None, dry=False, verbose=False):
    """Launch ``main_cmd`` inside Docker on a new EC2 spot instance.

    Builds a bash user-data script (swap setup, docker pull, awscli
    install, mount staging, periodic S3 log sync, termination hooks,
    the docker command itself, and a final S3 sync), then submits a
    spot-instance request via boto3 and tags it with the experiment
    name. If the script is too large for the user-data limit it is
    uploaded to S3 and replaced by a small bootstrap script.

    Args:
        main_cmd: Command string to run inside the Docker container.
        mount_points: Iterable of mounts (``MountLocal`` read-only code
            mounts or ``MountS3`` output mounts).
        dry: If True, build everything but do not request the instance.
        verbose: If True, print the generated script and request args.

    Raises:
        ValueError: For a writable ``MountLocal``.
        NotImplementedError: For a non-output ``MountS3``, an unknown
            mount type, or checkpoint restore.
    """
    # Snapshot of instance-launch configuration pulled from self.
    default_config = dict(
        image_id=self.image_id,
        instance_type=self.instance_type,
        key_name=self.aws_key_name,
        spot_price=self.spot_price,
        iam_instance_profile_name=self.iam_instance_profile_name,
        security_groups=self.security_groups,
        security_group_ids=self.security_group_ids,
        network_interfaces=[],
    )
    aws_config = dict(default_config)
    # Experiment naming: explicit log name wins, otherwise prefix + timekey.
    if self.s3_log_name is None:
        exp_name = "{}-{}".format(self.s3_log_prefix, self.make_timekey())
    else:
        exp_name = self.s3_log_name
    exp_prefix = self.s3_log_prefix
    # S3 log layout: <s3_path>/<prefix-with-dashes>/<exp_name>
    s3_base_dir = os.path.join(self.aws_s3_path,
                               exp_prefix.replace("_", "-"), exp_name)
    # $EC2_INSTANCE_ID is expanded by the remote shell, not by Python.
    stdout_log_s3_path = os.path.join(s3_base_dir,
                                      'stdout_$EC2_INSTANCE_ID.log')

    # --- Build the user-data bash script incrementally. The entire body
    # is wrapped in `{ ... } >> user_data.log 2>&1` (see the closing
    # write near the end), so write order is significant throughout.
    sio = StringIO()
    sio.write("#!/bin/bash\n")
    sio.write("truncate -s 0 /home/ubuntu/user_data.log\n")
    sio.write("{\n")
    sio.write(
        'die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n')
    # Instance id from the EC2 metadata service.
    sio.write(
        'EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n')
    # Tag the instance with the experiment name/prefix from inside itself.
    sio.write("""
aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region}
""".format(exp_name=exp_name, aws_region=self.region))
    sio.write("""
aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region}
""".format(exp_prefix=exp_prefix, aws_region=self.region))
    # Add swap file
    if self.gpu:
        swap_location = '/mnt/swapfile'
    else:
        swap_location = '/var/swap.1'
    sio.write(
        'sudo dd if=/dev/zero of={swap_location} bs=1M count={swap_size}\n'
        .format(swap_location=swap_location, swap_size=self.swap_size))
    sio.write(
        'sudo mkswap {swap_location}\n'.format(swap_location=swap_location))
    sio.write('sudo chmod 600 {swap_location}\n'.format(
        swap_location=swap_location))
    sio.write(
        'sudo swapon {swap_location}\n'.format(swap_location=swap_location))
    sio.write("service docker start\n")
    sio.write(
        "docker --config /home/ubuntu/.docker pull {docker_image}\n".format(
            docker_image=self.docker_image))
    sio.write("export AWS_DEFAULT_REGION={aws_region}\n".format(
        aws_region=self.s3_bucket_region))
    # Install the AWS CLI bundle on the instance.
    sio.write("""
curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
sudo apt-get install unzip
unzip awscli-bundle.zip
sudo ./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws
""")

    mnt_args = ''                       # accumulated `-v host:container` args
    py_path = []                        # container paths for PYTHONPATH
    local_output_dir_and_s3_path = []   # (instance_dir, s3_path) output pairs
    max_sync_interval = 0               # slowest mount sync cadence seen
    for mount in mount_points:
        print('Handling mount: ', mount)
        if isinstance(mount, MountLocal):
            # TODO: these should be mount_s3 objects
            if mount.read_only:
                if mount.path_on_remote is None:
                    # First launch: tar+gzip the local dir and upload it,
                    # keyed by content hash so identical code is reused.
                    with mount.gzip() as gzip_file:
                        gzip_path = os.path.realpath(gzip_file)
                        file_hash = utils.hash_file(gzip_path)
                        s3_path = self.s3_upload(gzip_path, self.s3_bucket,
                                                 remote_filename=file_hash + '.tar')
                    mount.path_on_remote = s3_path
                    # NOTE(review): this caches the local gzip *path*, but
                    # the cache-hit branch below (and the GCP launcher)
                    # treat local_file_hash as the file *hash* — suspected
                    # bug; confirm before relying on relaunch caching.
                    mount.local_file_hash = gzip_path
                else:
                    # Already uploaded on a previous launch.
                    file_hash = mount.local_file_hash
                    s3_path = mount.path_on_remote
                remote_tar_name = '/tmp/' + file_hash + '.tar'
                remote_unpack_name = '/tmp/' + file_hash
                # Download and unpack the code tarball on the instance.
                sio.write("aws s3 cp {s3_path} {remote_tar_name}\n".format(
                    s3_path=s3_path, remote_tar_name=remote_tar_name))
                sio.write("mkdir -p {local_code_path}\n".format(
                    local_code_path=remote_unpack_name))
                sio.write(
                    "tar -xvf {remote_tar_name} -C {local_code_path}\n".format(
                        local_code_path=remote_unpack_name,
                        remote_tar_name=remote_tar_name))
                # Mount under /mounts inside the container, stripping '~/'.
                mount_point = os.path.join(
                    '/mounts', mount.mount_point.replace('~/', ''))
                mnt_args += ' -v %s:%s' % (
                    os.path.join(remote_unpack_name,
                                 os.path.basename(mount.local_dir)),
                    mount_point)
                if mount.pythonpath:
                    py_path.append(mount_point)
            else:
                # Writable local mounts are not supported on EC2.
                raise ValueError()
        elif isinstance(mount, MountS3):
            # In theory the ec2_local_dir could be some random directory,
            # but we make it the same as the mount directory for
            # convenience.
            #
            # ec2_local_dir: directory visible to ec2 spot instance
            # moint_point: directory visible to docker running inside ec2
            # spot instance
            ec2_local_dir = mount.mount_point
            s3_path = os.path.join(s3_base_dir, mount.s3_path)
            # With a single experiment, stdout goes next to its output.
            if self.num_exps == 1:
                stdout_log_s3_path = os.path.join(
                    s3_path, 'stdout_$EC2_INSTANCE_ID.log')
            if not mount.output:
                # Only output (synced-to-S3) mounts are supported.
                raise NotImplementedError()
            local_output_dir_and_s3_path.append(
                (ec2_local_dir, s3_path)
            )
            sio.write("mkdir -p {remote_dir}\n".format(
                remote_dir=ec2_local_dir)
            )
            mnt_args += ' -v %s:%s' % (ec2_local_dir, mount.mount_point)

            # Sync interval: background loop syncing outputs to S3.
            sio.write("""
while /bin/true; do
    aws s3 sync --exclude '*' {include_string} {log_dir} {s3_path}
    sleep {periodic_sync_interval}
done & echo sync initiated
""".format(
                include_string=mount.include_string,
                log_dir=ec2_local_dir,
                s3_path=s3_path,
                periodic_sync_interval=mount.sync_interval
            ))
            max_sync_interval = max(max_sync_interval, mount.sync_interval)

            # Sync on terminate. This catches the case where the spot
            # instance gets terminated before the user script ends.
            #
            # This is hoping that there's at least 3 seconds between when
            # the spot instance gets marked for termination and when it
            # actually terminates.
            sio.write("""
while /bin/true; do
    if [ -z $(curl -Is http://169.254.169.254/latest/meta-data/spot/termination-time | head -1 | grep 404 | cut -d \ -f 2) ]
    then
        logger "Running shutdown hook."
        aws s3 cp --recursive {log_dir} {s3_path}
        aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
        break
    else
        # Spot instance not yet marked for termination.
        # This is hoping that there's at least 3 seconds
        # between when the spot instance gets marked for
        # termination and when it actually terminates.
        sleep 3
    fi
done & echo log sync initiated
""".format(
                log_dir=ec2_local_dir,
                s3_path=s3_path,
                stdout_log_s3_path=stdout_log_s3_path,
            ))
        else:
            raise NotImplementedError()

    # Background loop shipping the combined stdout/stderr log to S3.
    sio.write("""
while /bin/true; do
    aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
    sleep {periodic_sync_interval}
done & echo sync initiated
""".format(
        stdout_log_s3_path=stdout_log_s3_path,
        periodic_sync_interval=max_sync_interval
    ))
    if self.gpu:
        # Sanity-check the GPU both on the host and inside Docker.
        sio.write("echo 'Testing nvidia-smi'\n")
        sio.write("nvidia-smi\n")
        sio.write("echo 'Testing nvidia-smi inside docker'\n")
        sio.write(
            "docker run --gpus all --rm {docker_image} nvidia-smi\n".format(
                docker_image=self.docker_image))
    if self.checkpoint and self.checkpoint.restore:
        raise NotImplementedError()
    else:
        docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False,
                                         extra_args=mnt_args,
                                         pythonpath=py_path,
                                         use_docker_generated_name=True)
        assert self.num_exps > 0
        # Run num_exps copies: all but the last in the background so the
        # script waits on the final (foreground) one.
        for _ in range(self.num_exps - 1):
            sio.write(docker_cmd + ' &\n')
        sio.write(docker_cmd + '\n')

    # Sync all output mounts to s3 after running the user script
    # Ideally the earlier while loop would be sufficient, but it might be
    # the case that the earlier while loop isn't fast enough to catch a
    # termination. So, we explicitly sync on termination.
    for (local_output_dir, s3_dir_path) in local_output_dir_and_s3_path:
        sio.write("aws s3 cp --recursive {local_dir} {s3_dir}\n".format(
            local_dir=local_output_dir,
            s3_dir=s3_dir_path
        ))
    sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format(
        stdout_log_s3_path,
    ))

    # Wait for last sync
    if max_sync_interval > 0:
        sio.write("sleep {}\n".format(max_sync_interval + 5))

    if self.terminate:
        # Self-terminate so the spot instance stops billing.
        sio.write("""
EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`"
aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region}
""".format(aws_region=self.region))
    # Close the `{` opened at the top: everything above logs to user_data.log.
    sio.write("} >> /home/ubuntu/user_data.log 2>&1\n")

    full_script = dedent(sio.getvalue())
    import boto3
    import botocore
    ec2 = boto3.client(
        "ec2",
        region_name=self.region,
        aws_access_key_id=self.credentials.aws_key,
        aws_secret_access_key=self.credentials.aws_secret_key,
    )

    # EC2 user-data is size-limited; if the script is too big (raw or
    # base64-encoded), stash it in S3 and bootstrap it from there.
    if len(full_script) > 10000 or len(
            base64.b64encode(full_script.encode()).decode("utf-8")) > 10000:
        s3_path = self.upload_file_to_s3(full_script, dry=dry)
        sio = StringIO()
        sio.write("#!/bin/bash\n")
        sio.write("""
aws s3 cp {s3_path} /home/ubuntu/remote_script.sh --region {aws_region} && \\
chmod +x /home/ubuntu/remote_script.sh && \\
bash /home/ubuntu/remote_script.sh
""".format(s3_path=s3_path, aws_region=self.s3_bucket_region))
        user_data = dedent(sio.getvalue())
    else:
        user_data = full_script

    if verbose:
        print(full_script)
        # Keep a local copy of the script for debugging.
        with open("/tmp/full_ec2_script", "w") as f:
            f.write(full_script)
    instance_args = dict(
        ImageId=aws_config["image_id"],
        KeyName=aws_config["key_name"],
        UserData=user_data,
        InstanceType=aws_config["instance_type"],
        EbsOptimized=False,
        SecurityGroups=aws_config["security_groups"],
        SecurityGroupIds=aws_config["security_group_ids"],
        NetworkInterfaces=aws_config["network_interfaces"],
        IamInstanceProfile=dict(
            Name=aws_config["iam_instance_profile_name"],
        ),
        # **config.AWS_EXTRA_CONFIGS,
    )
    if self.extra_ec2_instance_kwargs is not None:
        instance_args.update(self.extra_ec2_instance_kwargs)
    if verbose:
        print(
            "************************************************************")
        print('UserData:', instance_args["UserData"])
        print(
            "************************************************************")
    # The spot API requires base64-encoded user data.
    instance_args["UserData"] = base64.b64encode(
        instance_args["UserData"].encode()).decode("utf-8")
    spot_args = dict(
        DryRun=dry,
        InstanceCount=1,
        LaunchSpecification=instance_args,
        SpotPrice=aws_config["spot_price"],
        # ClientToken=params_list[0]["exp_name"],
    )
    import pprint
    if verbose:
        pprint.pprint(spot_args)
    if not dry:
        response = ec2.request_spot_instances(**spot_args)
        print('Launched EC2 job - Server response:')
        pprint.pprint(response)
        print('*****' * 5)
        spot_request_id = response['SpotInstanceRequests'][
            0]['SpotInstanceRequestId']
        # Tagging can fail until the request propagates; retry a few times.
        for _ in range(10):
            try:
                ec2.create_tags(
                    Resources=[spot_request_id],
                    Tags=[
                        {'Key': 'Name', 'Value': exp_name}
                    ],
                )
                break
            except botocore.exceptions.ClientError:
                continue