Code example #1
0
    def launch_command(self, main_cmd, mount_points=None, dry=False,
                       verbose=False):
        """Launch ``main_cmd`` inside Docker on a new GCP instance.

        Uploads read-only local mounts to GCS (content-addressed by file
        hash), encodes the run configuration into instance metadata, and
        creates the instance via ``self.create_instance``.

        Args:
            main_cmd: Command string executed inside the Docker container.
            mount_points: Iterable of MountLocal / MountGCP objects describing
                code and output directories. ``None`` is treated as no mounts.
            dry: Accepted for signature parity with the EC2 launcher.
                NOTE(review): the instance is created even when dry=True —
                confirm this is intended.
            verbose: If True, print the generated instance name and metadata.

        Raises:
            ValueError: For a writable MountLocal (unsupported).
            NotImplementedError: For a non-output MountGCP or an unknown
                mount type.
        """
        if self.gcp_log_name is None:
            # NOTE(review): calls EC2SpotDocker.make_timekey with this (GCP)
            # launcher as self; safe only if make_timekey touches shared
            # attributes — confirm.
            exp_name = "{}-{}".format(self.gcp_log_prefix,
                                      EC2SpotDocker.make_timekey(self))
        else:
            exp_name = self.gcp_log_name
        exp_prefix = self.gcp_log_prefix
        # Underscores are replaced because the prefix becomes part of a
        # GCS/instance path component.
        gcp_base_dir = os.path.join(self.gcp_log_path,
                                    exp_prefix.replace("_", "-"), exp_name)

        mnt_args = ''
        py_path = []
        gcp_mount_info = []
        local_mounts = []
        # Robustness: treat mount_points=None as "no mounts" instead of
        # raising TypeError on iteration.
        for mount in (mount_points or []):
            print('Handling mount: ', mount)
            if isinstance(mount,
                          MountLocal):  # TODO: these should be mount_s3 objects
                if mount.read_only:
                    if mount.path_on_remote is None:
                        # Gzip the local directory and upload it under a
                        # content-hash name, so identical code is only
                        # uploaded once across launches.
                        with mount.gzip() as gzip_file:
                            gzip_path = os.path.realpath(gzip_file)
                            file_hash = utils.hash_file(gzip_path)
                            gcp_path = upload_file_to_gcp_storage(
                                bucket_name=self.gcp_bucket_name,
                                file_name=gzip_path,
                                remote_filename=file_hash + '.tar'
                            )
                        # Cache upload results on the mount for reuse.
                        mount.path_on_remote = gcp_path
                        mount.local_file_hash = file_hash
                    else:
                        # Already uploaded on a previous launch; reuse it.
                        file_hash = mount.local_file_hash
                        gcp_path = mount.path_on_remote
                    remote_unpack_name = '/tmp/' + file_hash
                    mount_point = os.path.join('/mounts',
                                               mount.mount_point.replace('~/',
                                                                         ''))
                    mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name,
                                                            os.path.basename(
                                                                mount.local_dir)),
                                               mount_point)
                    if mount.pythonpath:
                        py_path.append(mount_point)
                    local_mounts.append(file_hash)
                else:
                    # Writable local mounts cannot be shipped to a remote VM.
                    raise ValueError()
            elif isinstance(mount, MountGCP):
                gcp_local_dir = mount.mount_point
                gcp_path = os.path.join(gcp_base_dir, mount.gcp_path)
                if not mount.output:
                    raise NotImplementedError()
                gcp_mount_info.append(
                    (gcp_local_dir, gcp_path, mount.include_string,
                     mount.sync_interval)
                )
                mnt_args += ' -v %s:%s' % (gcp_local_dir, mount.mount_point)
            else:
                raise NotImplementedError()

        docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False,
                                         extra_args=mnt_args,
                                         pythonpath=py_path,
                                         use_docker_generated_name=True)

        # Read the startup/shutdown scripts with context managers so the
        # file handles are closed deterministically (the original
        # open(...).read() calls leaked them).
        with open(GCP_STARTUP_SCRIPT_PATH, "r") as f:
            startup_script = f.read()
        with open(GCP_SHUTDOWN_SCRIPT_PATH, "r") as f:
            shutdown_script = f.read()

        metadata = {
            'bucket_name': self.gcp_bucket_name,
            'docker_cmd': docker_cmd,
            'docker_image': self.docker_image,
            'local_mounts': json.dumps(local_mounts),
            'gcp_mounts': json.dumps(gcp_mount_info),
            'use_gpu': json.dumps(self.gpu),
            'num_exps': self.num_exps,
            'terminate': json.dumps(self.terminate),
            'startup-script': startup_script,
            'shutdown-script': shutdown_script,
        }
        # instance name must match regex'(?:[a-z](?:[-a-z0-9]{0,61}[a-z0-9])?)'">
        unique_name = "doodad" + str(uuid.uuid4()).replace("-", "")
        self.create_instance(metadata, unique_name, exp_name, exp_prefix)
        if verbose:
            print(unique_name)
            print(metadata)
Code example #2
0
    def launch_command(self, main_cmd, mount_points=None, dry=False,
                       verbose=False):
        """Launch ``main_cmd`` inside Docker on a new EC2 spot instance.

        Builds a bash user-data script that tags the instance, sets up swap,
        pulls the Docker image, unpacks read-only code mounts from S3, starts
        background S3 sync loops for output mounts, runs the Docker command
        (``self.num_exps`` copies), performs a final sync, and optionally
        terminates the instance. The script is then submitted through
        ``request_spot_instances``.

        Args:
            main_cmd: Command string executed inside the Docker container.
            mount_points: Iterable of MountLocal / MountS3 objects describing
                code and output directories. ``None`` is treated as no mounts.
            dry: If True, pass DryRun to EC2 and skip the spot request.
            verbose: If True, print the generated script and request args.

        Raises:
            ValueError: For a writable MountLocal (unsupported).
            NotImplementedError: For a non-output MountS3, an unknown mount
                type, or checkpoint restore.
        """
        default_config = dict(
            image_id=self.image_id,
            instance_type=self.instance_type,
            key_name=self.aws_key_name,
            spot_price=self.spot_price,
            iam_instance_profile_name=self.iam_instance_profile_name,
            security_groups=self.security_groups,
            security_group_ids=self.security_group_ids,
            network_interfaces=[],
        )
        aws_config = dict(default_config)
        if self.s3_log_name is None:
            exp_name = "{}-{}".format(self.s3_log_prefix, self.make_timekey())
        else:
            exp_name = self.s3_log_name
        exp_prefix = self.s3_log_prefix
        # Underscores are replaced because the prefix becomes an S3 path
        # component.
        s3_base_dir = os.path.join(self.aws_s3_path,
                                   exp_prefix.replace("_", "-"), exp_name)
        stdout_log_s3_path = os.path.join(s3_base_dir,
                                          'stdout_$EC2_INSTANCE_ID.log')

        sio = StringIO()
        sio.write("#!/bin/bash\n")
        sio.write("truncate -s 0 /home/ubuntu/user_data.log\n")
        # Everything below runs inside a { ... } group whose output is
        # redirected to user_data.log at the end of the script.
        sio.write("{\n")
        sio.write(
            'die() { status=$1; shift; echo "FATAL: $*"; exit $status; }\n')
        # Instance id comes from the EC2 metadata service; used for tagging
        # and the per-instance stdout log name.
        sio.write(
            'EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id`"\n')
        sio.write("""
            aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=Name,Value={exp_name} --region {aws_region}
        """.format(exp_name=exp_name, aws_region=self.region))
        sio.write("""
            aws ec2 create-tags --resources $EC2_INSTANCE_ID --tags Key=exp_prefix,Value={exp_prefix} --region {aws_region}
        """.format(exp_prefix=exp_prefix, aws_region=self.region))

        # Add swap file (GPU instances use the ephemeral /mnt volume).
        if self.gpu:
            swap_location = '/mnt/swapfile'
        else:
            swap_location = '/var/swap.1'
        sio.write(
            'sudo dd if=/dev/zero of={swap_location} bs=1M count={swap_size}\n'
            .format(swap_location=swap_location, swap_size=self.swap_size))
        sio.write(
            'sudo mkswap {swap_location}\n'.format(swap_location=swap_location))
        sio.write('sudo chmod 600 {swap_location}\n'.format(
            swap_location=swap_location))
        sio.write(
            'sudo swapon {swap_location}\n'.format(swap_location=swap_location))

        sio.write("service docker start\n")
        sio.write(
            "docker --config /home/ubuntu/.docker pull {docker_image}\n".format(
                docker_image=self.docker_image))
        sio.write("export AWS_DEFAULT_REGION={aws_region}\n".format(
            aws_region=self.s3_bucket_region))
        # Install a recent AWS CLI bundle (the AMI may not ship one).
        sio.write("""
            curl "https://s3.amazonaws.com/aws-cli/awscli-bundle.zip" -o "awscli-bundle.zip"
            sudo apt-get install unzip
            unzip awscli-bundle.zip
            sudo ./awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws
        """)

        mnt_args = ''
        py_path = []
        local_output_dir_and_s3_path = []
        max_sync_interval = 0
        # Robustness: treat mount_points=None as "no mounts" instead of
        # raising TypeError on iteration.
        for mount in (mount_points or []):
            print('Handling mount: ', mount)
            if isinstance(mount,
                          MountLocal):  # TODO: these should be mount_s3 objects
                if mount.read_only:
                    if mount.path_on_remote is None:
                        # Gzip the local directory and upload it under a
                        # content-hash name, so identical code is only
                        # uploaded once across launches.
                        with mount.gzip() as gzip_file:
                            gzip_path = os.path.realpath(gzip_file)
                            file_hash = utils.hash_file(gzip_path)
                            s3_path = self.s3_upload(gzip_path, self.s3_bucket,
                                                     remote_filename=file_hash + '.tar')
                        mount.path_on_remote = s3_path
                        # Bug fix: cache the content hash (previously the
                        # local gzip *path* was stored here), so the reuse
                        # branch below derives the correct /tmp/<hash> names
                        # on subsequent launches. Matches the GCP launcher.
                        mount.local_file_hash = file_hash
                    else:
                        # Already uploaded on a previous launch; reuse it.
                        file_hash = mount.local_file_hash
                        s3_path = mount.path_on_remote
                    remote_tar_name = '/tmp/' + file_hash + '.tar'
                    remote_unpack_name = '/tmp/' + file_hash
                    sio.write("aws s3 cp {s3_path} {remote_tar_name}\n".format(
                        s3_path=s3_path, remote_tar_name=remote_tar_name))
                    sio.write("mkdir -p {local_code_path}\n".format(
                        local_code_path=remote_unpack_name))
                    sio.write(
                        "tar -xvf {remote_tar_name} -C {local_code_path}\n".format(
                            local_code_path=remote_unpack_name,
                            remote_tar_name=remote_tar_name))
                    mount_point = os.path.join('/mounts',
                                               mount.mount_point.replace('~/',
                                                                         ''))
                    mnt_args += ' -v %s:%s' % (os.path.join(remote_unpack_name,
                                                            os.path.basename(
                                                                mount.local_dir)),
                                               mount_point)
                    if mount.pythonpath:
                        py_path.append(mount_point)
                else:
                    # Writable local mounts cannot be shipped to a remote VM.
                    raise ValueError()
            elif isinstance(mount, MountS3):
                # In theory the ec2_local_dir could be some random directory,
                # but we make it the same as the mount directory for
                # convenience.
                #
                # ec2_local_dir: directory visible to ec2 spot instance
                # mount_point: directory visible to docker running inside ec2
                #              spot instance
                ec2_local_dir = mount.mount_point
                s3_path = os.path.join(s3_base_dir, mount.s3_path)
                if self.num_exps == 1:
                    # With a single experiment, co-locate the stdout log with
                    # the (last) output mount's S3 path.
                    stdout_log_s3_path = os.path.join(s3_path,
                                                      'stdout_$EC2_INSTANCE_ID.log')
                if not mount.output:
                    raise NotImplementedError()
                local_output_dir_and_s3_path.append(
                    (ec2_local_dir, s3_path)
                )
                sio.write("mkdir -p {remote_dir}\n".format(
                    remote_dir=ec2_local_dir)
                )
                mnt_args += ' -v %s:%s' % (ec2_local_dir, mount.mount_point)

                # Periodic background sync of the output dir to S3.
                sio.write("""
                while /bin/true; do
                    aws s3 sync --exclude '*' {include_string} {log_dir} {s3_path}
                    sleep {periodic_sync_interval}
                done & echo sync initiated
                """.format(
                    include_string=mount.include_string,
                    log_dir=ec2_local_dir,
                    s3_path=s3_path,
                    periodic_sync_interval=mount.sync_interval
                ))
                max_sync_interval = max(max_sync_interval, mount.sync_interval)

                # Sync on terminate. This catches the case where the spot
                # instance gets terminated before the user script ends.
                #
                # This is hoping that there's at least 3 seconds between when
                # the spot instance gets marked for  termination and when it
                # actually terminates.
                sio.write("""
                    while /bin/true; do
                        if [ -z $(curl -Is http://169.254.169.254/latest/meta-data/spot/termination-time | head -1 | grep 404 | cut -d \  -f 2) ]
                        then
                            logger "Running shutdown hook."
                            aws s3 cp --recursive {log_dir} {s3_path}
                            aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
                            break
                        else
                            # Spot instance not yet marked for termination.
                            # This is hoping that there's at least 3 seconds
                            # between when the spot instance gets marked for
                            # termination and when it actually terminates.
                            sleep 3
                        fi
                    done & echo log sync initiated
                """.format(
                    log_dir=ec2_local_dir,
                    s3_path=s3_path,
                    stdout_log_s3_path=stdout_log_s3_path,
                ))
            else:
                raise NotImplementedError()

        # Periodic background sync of the stdout log itself.
        sio.write("""
        while /bin/true; do
            aws s3 cp /home/ubuntu/user_data.log {stdout_log_s3_path}
            sleep {periodic_sync_interval}
        done & echo sync initiated
        """.format(
            stdout_log_s3_path=stdout_log_s3_path,
            periodic_sync_interval=max_sync_interval
        ))

        if self.gpu:
            # Sanity-check GPU visibility on the host and inside Docker.
            sio.write("echo 'Testing nvidia-smi'\n")
            sio.write("nvidia-smi\n")
            sio.write("echo 'Testing nvidia-smi inside docker'\n")
            sio.write(
                "docker run --gpus all --rm {docker_image} nvidia-smi\n".format(
                    docker_image=self.docker_image))

        if self.checkpoint and self.checkpoint.restore:
            raise NotImplementedError()
        else:
            docker_cmd = self.get_docker_cmd(main_cmd, use_tty=False,
                                             extra_args=mnt_args,
                                             pythonpath=py_path,
                                             use_docker_generated_name=True)
        assert self.num_exps > 0
        # Run num_exps copies: all but the last in the background, the last
        # in the foreground so the script blocks until it finishes.
        for _ in range(self.num_exps - 1):
            sio.write(docker_cmd + ' &\n')
        sio.write(docker_cmd + '\n')

        # Sync all output mounts to s3 after running the user script
        # Ideally the earlier while loop would be sufficient, but it might be
        # the case that the earlier while loop isn't fast enough to catch a
        # termination. So, we explicitly sync on termination.
        for (local_output_dir, s3_dir_path) in local_output_dir_and_s3_path:
            sio.write("aws s3 cp --recursive {local_dir} {s3_dir}\n".format(
                local_dir=local_output_dir,
                s3_dir=s3_dir_path
            ))
        sio.write("aws s3 cp /home/ubuntu/user_data.log {}\n".format(
            stdout_log_s3_path,
        ))

        # Wait for last sync
        if max_sync_interval > 0:
            sio.write("sleep {}\n".format(max_sync_interval + 5))

        if self.terminate:
            sio.write("""
                EC2_INSTANCE_ID="`wget -q -O - http://169.254.169.254/latest/meta-data/instance-id || die \"wget instance-id has failed: $?\"`"
                aws ec2 terminate-instances --instance-ids $EC2_INSTANCE_ID --region {aws_region}
            """.format(aws_region=self.region))
        sio.write("} >> /home/ubuntu/user_data.log 2>&1\n")

        full_script = dedent(sio.getvalue())
        import boto3
        import botocore
        ec2 = boto3.client(
            "ec2",
            region_name=self.region,
            aws_access_key_id=self.credentials.aws_key,
            aws_secret_access_key=self.credentials.aws_secret_key,
        )

        # EC2 limits user data size; if the script is too large, upload it to
        # S3 and replace the user data with a small bootstrap that fetches it.
        if len(full_script) > 10000 or len(
                base64.b64encode(full_script.encode()).decode("utf-8")) > 10000:
            s3_path = self.upload_file_to_s3(full_script, dry=dry)
            sio = StringIO()
            sio.write("#!/bin/bash\n")
            sio.write("""
            aws s3 cp {s3_path} /home/ubuntu/remote_script.sh --region {aws_region} && \\
            chmod +x /home/ubuntu/remote_script.sh && \\
            bash /home/ubuntu/remote_script.sh
            """.format(s3_path=s3_path, aws_region=self.s3_bucket_region))
            user_data = dedent(sio.getvalue())
        else:
            user_data = full_script

        if verbose:
            print(full_script)
            with open("/tmp/full_ec2_script", "w") as f:
                f.write(full_script)

        instance_args = dict(
            ImageId=aws_config["image_id"],
            KeyName=aws_config["key_name"],
            UserData=user_data,
            InstanceType=aws_config["instance_type"],
            EbsOptimized=False,
            SecurityGroups=aws_config["security_groups"],
            SecurityGroupIds=aws_config["security_group_ids"],
            NetworkInterfaces=aws_config["network_interfaces"],
            IamInstanceProfile=dict(
                Name=aws_config["iam_instance_profile_name"],
            ),
            # **config.AWS_EXTRA_CONFIGS,
        )
        if self.extra_ec2_instance_kwargs is not None:
            instance_args.update(self.extra_ec2_instance_kwargs)

        if verbose:
            print(
                "************************************************************")
            print('UserData:', instance_args["UserData"])
            print(
                "************************************************************")
        # The API requires user data to be base64-encoded.
        instance_args["UserData"] = base64.b64encode(
            instance_args["UserData"].encode()).decode("utf-8")
        spot_args = dict(
            DryRun=dry,
            InstanceCount=1,
            LaunchSpecification=instance_args,
            SpotPrice=aws_config["spot_price"],
            # ClientToken=params_list[0]["exp_name"],
        )

        import pprint

        if verbose:
            pprint.pprint(spot_args)
        if not dry:
            response = ec2.request_spot_instances(**spot_args)
            print('Launched EC2 job - Server response:')
            pprint.pprint(response)
            print('*****' * 5)
            spot_request_id = response['SpotInstanceRequests'][
                0]['SpotInstanceRequestId']
            # The spot request may not be taggable immediately after
            # creation, so retry a few times.
            # NOTE(review): retries busily with no backoff and gives up
            # silently after 10 attempts — consider a short sleep between
            # tries.
            for _ in range(10):
                try:
                    ec2.create_tags(
                        Resources=[spot_request_id],
                        Tags=[
                            {'Key': 'Name', 'Value': exp_name}
                        ],
                    )
                    break
                except botocore.exceptions.ClientError:
                    continue