    def checksum_and_upload_file(
            self,
            local_path: str,
            remote_path: str,
            content_type: str = "application/octet-stream",
            metadata: typing.Dict[str, str] = None,
            *args,
            **kwargs) -> None:
        if metadata is None:
            metadata = dict()

        fpath = os.path.join(self.local_root, local_path)
        size = os.path.getsize(fpath)
        chunk_size = get_s3_multipart_chunk_size(size)
        with ChecksummingSink(write_chunk_size=chunk_size) as sink, open(
                fpath, "rb") as fh:
            data = fh.read()
            sink.write(data)

            sums = sink.get_checksums()

        metadata['hca-dss-crc32c'] = sums['crc32c'].lower()
        metadata['hca-dss-s3_etag'] = sums['s3_etag'].lower()
        metadata['hca-dss-sha1'] = sums['sha1'].lower()
        metadata['hca-dss-sha256'] = sums['sha256'].lower()

        self.upload_file(local_path, remote_path, content_type, metadata,
                         *args, **kwargs)  # noqa
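Every example in this listing sizes its parts with get_s3_multipart_chunk_size. Its implementation is not reproduced here; the sketch below only illustrates the kind of rule such a helper typically applies (staying under S3's 10,000-part limit without dropping below the 5 MiB minimum part size). The function name and constants are illustrative assumptions, not the library's actual code.

import math

MiB = 1024 * 1024
MIN_PART_SIZE = 5 * MiB   # S3's minimum multipart part size (all parts but the last)
MAX_PARTS = 10000         # S3 allows at most 10,000 parts per multipart upload

def approx_s3_multipart_chunk_size(file_size: int) -> int:
    """Illustrative stand-in for get_s3_multipart_chunk_size; assumed behaviour only."""
    # Smallest whole-MiB part size that keeps the part count at or below MAX_PARTS,
    # never smaller than the 5 MiB minimum.
    needed = math.ceil(file_size / MAX_PARTS) if file_size else MIN_PART_SIZE
    return max(MIN_PART_SIZE, math.ceil(needed / MiB) * MiB)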
class TestChecksummingBufferedReader(unittest.TestCase):

    file_size = os.path.getsize(TEST_FILE)
    chunk_size = s3_multipart.get_s3_multipart_chunk_size(file_size)

    def check_sums(self, checksums):
        self.assertEqual(checksums['sha1'], TEST_FILE_CHECKSUMS['sha1'])
        self.assertEqual(checksums['sha256'], TEST_FILE_CHECKSUMS['sha256'])
        self.assertEqual(checksums['crc32c'].lower(),
                         TEST_FILE_CHECKSUMS['crc32c'].lower())
        self.assertEqual(checksums['s3_etag'], TEST_FILE_CHECKSUMS['s3_etag'])

    def test_checksums_after_single_read(self):
        with io.open(TEST_FILE, 'rb') as fh:
            reader = ChecksummingBufferedReader(fh, self.chunk_size)
            reader.read()
            sums = reader.get_checksums()
            self.check_sums(sums)

    def test_checksums_after_multiple_reads(self):
        with io.open(TEST_FILE, 'rb') as raw_fh:
            reader = ChecksummingBufferedReader(raw_fh, self.chunk_size)
            while True:
                buf = reader.read(self.chunk_size)
                if not buf:
                    break
            sums = reader.get_checksums()
            self.check_sums(sums)
    def _upload_local_file_to_staging(self, path: str, file_uuid: str,
                                      content_type):
        """
        Upload a local file to the staging bucket, computing the DSS-required checksums
        in the process, then tag the file in the staging bucket with the checksums.
        This is in preparation for subsequently uploading the file from the staging
        bucket into the DSS.

        :param path: Path to a local file.
        :param file_uuid: An RFC4122-compliant UUID to be used to identify the file.
        :param content_type: Content description, for example: "application/json; dss-type=fileref".
        :return: file_uuid: str, key_name: str
        """
        def _encode_tags(tags):
            return [dict(Key=k, Value=v) for k, v in tags.items()]

        def _mime_type(filename):
            type_, encoding = mimetypes.guess_type(filename)
            if encoding:
                return encoding
            if type_:
                return type_
            return "application/octet-stream"

        file_size = os.path.getsize(path)
        multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(
            file_size)
        tx_cfg = TransferConfig(
            multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
            multipart_chunksize=multipart_chunksize)
        s3 = boto3.resource("s3")

        destination_bucket = s3.Bucket(self.staging_bucket)
        with open(path, "rb") as file_handle, ChecksummingBufferedReader(
                file_handle, multipart_chunksize) as fh:
            key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
            destination_bucket.upload_fileobj(
                fh,
                key_name,
                Config=tx_cfg,
                ExtraArgs={
                    'ContentType': (content_type if content_type is not None
                                    else _mime_type(fh.raw.name)),
                })
            sums = fh.get_checksums()
            metadata = {
                "hca-dss-s3_etag": sums["s3_etag"],
                "hca-dss-sha1": sums["sha1"],
                "hca-dss-sha256": sums["sha256"],
                "hca-dss-crc32c": sums["crc32c"],
            }

            s3.meta.client.put_object_tagging(
                Bucket=destination_bucket.name,
                Key=key_name,
                Tagging=dict(TagSet=_encode_tags(metadata)))
        return file_uuid, key_name
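A caller can confirm the staging upload by reading the checksum tags back from the object. A minimal sketch, assuming the bucket and key returned above and a standard boto3 client; the helper name is hypothetical:

import boto3

def read_staging_checksum_tags(bucket_name: str, key_name: str) -> dict:
    # Fetch the hca-dss-* tags written by _upload_local_file_to_staging above.
    client = boto3.client("s3")
    response = client.get_object_tagging(Bucket=bucket_name, Key=key_name)
    return {tag["Key"]: tag["Value"] for tag in response["TagSet"]}

# e.g. read_staging_checksum_tags(staging_bucket, key_name)["hca-dss-sha256"]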
Example #4
def upload_to_cloud(file_handles, staging_bucket, replica, from_cloud=False):
    """
    Upload files to cloud.

    :param file_handles: If from_cloud is True, file_handles is an AWS S3 directory path to files with
                         appropriate metadata uploaded. Otherwise, a list of binary file_handles to upload.
    :param staging_bucket: The AWS bucket to upload the files to.
    :param replica: The cloud replica to write to. One of 'aws', 'gc', or 'azure'. Currently has no effect.
    :return: a tuple of (file_uuids, key_names) for the uploaded files.
    """
    s3 = boto3.resource("s3")
    file_uuids = []
    key_names = []

    if from_cloud:
        file_uuids, key_names = _copy_from_s3(file_handles[0], s3)
    else:
        destination_bucket = s3.Bucket(staging_bucket)
        for raw_fh in file_handles:
            file_size = os.path.getsize(raw_fh.name)
            multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(
                file_size)
            tx_cfg = TransferConfig(
                multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                multipart_chunksize=multipart_chunksize)
            with ChecksummingBufferedReader(raw_fh, multipart_chunksize) as fh:
                file_uuid = str(uuid.uuid4())
                key_name = "{}/{}".format(file_uuid,
                                          os.path.basename(fh.raw.name))
                destination_bucket.upload_fileobj(fh,
                                                  key_name,
                                                  Config=tx_cfg,
                                                  ExtraArgs={
                                                      'ContentType':
                                                      _mime_type(fh.raw.name),
                                                  })
                sums = fh.get_checksums()
                metadata = {
                    "hca-dss-s3_etag": sums["s3_etag"],
                    "hca-dss-sha1": sums["sha1"],
                    "hca-dss-sha256": sums["sha256"],
                    "hca-dss-crc32c": sums["crc32c"],
                }
                s3.meta.client.put_object_tagging(
                    Bucket=destination_bucket.name,
                    Key=key_name,
                    Tagging=dict(TagSet=encode_tags(metadata)))
                file_uuids.append(file_uuid)
                key_names.append(key_name)

    return file_uuids, key_names
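A minimal invocation of upload_to_cloud for local files might look like the following, assuming the function and its module-level helpers (_mime_type, encode_tags) are importable; the paths and bucket name are placeholders:

from contextlib import ExitStack

paths = ["./data/sample_1.bin", "./data/sample_2.bin"]   # placeholder local files
with ExitStack() as stack:
    handles = [stack.enter_context(open(p, "rb")) for p in paths]
    file_uuids, key_names = upload_to_cloud(handles,
                                            staging_bucket="my-staging-bucket",  # placeholder bucket
                                            replica="aws")
for file_uuid, key_name in zip(file_uuids, key_names):
    print(file_uuid, key_name)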
    def _compute_checksums(self, progress_callback=None):
        multipart_chunksize = get_s3_multipart_chunk_size(
            self._s3obj.content_length)
        with ChecksummingSink(multipart_chunksize) as sink:
            self._s3client.download_fileobj(self._s3obj.bucket_name,
                                            self._s3obj.key,
                                            sink,
                                            Callback=progress_callback,
                                            Config=self._transfer_config())
            checksums = sink.get_checksums()
            if len(DssChecksums.CHECKSUM_NAMES) != len(checksums):
                error = f"checksums {checksums} for {self._s3obj.key} do not meet requirements"
                raise UploadException(status=500,
                                      title=error,
                                      detail=str(checksums))
            return checksums
    def compute(self):
        """ Compute the checksum(s) for the given file and return a map of the value by the hash function name. """
        start_time = time.time()
        _file_size = os.path.getsize(self._filename)
        _multipart_chunksize = get_s3_multipart_chunk_size(_file_size)
        with ChecksummingSink(_multipart_chunksize,
                              hash_functions=self._checksums) as sink:
            with open(self._filename, 'rb') as _file_object:
                data = _file_object.read(_multipart_chunksize)
                while data:
                    sink.write(data)
                    data = _file_object.read(_multipart_chunksize)
            checksums = sink.get_checksums()
            print("Checksumming took %.2f milliseconds to compute" %
                  ((time.time() - start_time) * 1000))
        return checksums
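For comparison, the same chunked-read pattern with a single plain hashlib digest looks like this; ChecksummingSink plays the role of the digest in the snippet above, fanning each written chunk out to several hash functions (crc32c, sha1, sha256 and the S3 ETag) at once. The function below is only an illustration, not part of the library:

import hashlib

def sha256_of_file(filename: str, chunk_size: int = 8 * 1024 * 1024) -> str:
    # Hash the file in fixed-size chunks so large files never have to fit in memory.
    digest = hashlib.sha256()
    with open(filename, 'rb') as file_object:
        while True:
            data = file_object.read(chunk_size)
            if not data:
                break
            digest.update(data)
    return digest.hexdigest()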
Example #7
    def upload_file(app, contents, replica):
        src_key = generate_test_key()
        encoded = json.dumps(contents).encode()
        chunk_size = get_s3_multipart_chunk_size(len(encoded))
        with io.BytesIO(encoded) as fh, ChecksummingSink(
                write_chunk_size=chunk_size) as sink:
            sink.write(fh.read())
            sums = sink.get_checksums()
            metadata = {
                'hca-dss-crc32c': sums['crc32c'].lower(),
                'hca-dss-s3_etag': sums['s3_etag'].lower(),
                'hca-dss-sha1': sums['sha1'].lower(),
                'hca-dss-sha256': sums['sha256'].lower()
            }
            fh.seek(0)

            if replica == 'gcp':
                gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
                gcp_client = gs_storage.Client.from_service_account_json(
                    os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
                gs_bucket = gcp_client.bucket(gs_test_bucket)
                blob = gs_bucket.blob(src_key)
                blob.upload_from_file(fh, content_type="application/json")
                blob.metadata = metadata
                blob.patch()
                source_url = f"gs://{gs_test_bucket}/{src_key}"

            if replica == 'aws':
                # TODO: consider switching to unmanaged uploader (putobject w/blob)
                s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
                s3 = boto3.resource('s3')
                s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                    fh, ExtraArgs={"Metadata": metadata})
                source_url = f"s3://{s3_test_bucket}/{src_key}"

        file_uuid = str(uuid4())
        version = datetime_to_version_format(datetime.utcnow())
        urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
        urlbuilder.add_query("version", version)

        resp_obj = app.put(str(urlbuilder),
                           json=dict(creator_uid=0, source_url=source_url),
                           headers=get_auth_header())
        resp_obj.raise_for_status()
        return file_uuid, resp_obj.json()["version"]
    def upload_file(self,
                    local_path: str,
                    remote_path: str,
                    content_type: str = "application/octet-stream",
                    metadata_keys: typing.Dict[str, str] = None,
                    tags: typing.Dict[str, str] = None,
                    s3_part_size: int = None,
                    *args,
                    **kwargs) -> None:
        if metadata_keys is None:
            metadata_keys = dict()
        if tags is None:
            tags = dict()

        fp = os.path.join(self.local_root, local_path)
        sz = os.stat(fp).st_size

        if s3_part_size is None:
            s3_part_size = get_s3_multipart_chunk_size(sz)

        transfer_config = TransferConfig(
            multipart_threshold=MULTIPART_THRESHOLD,
            multipart_chunksize=s3_part_size,
        )

        logger.info(
            f"Uploading {local_path} to s3://{self.bucket}/{remote_path}")
        self.s3_client.upload_file(
            fp,
            self.bucket,
            remote_path,
            ExtraArgs={
                "Metadata": metadata_keys,
                "ContentType": content_type,
            },
            Config=transfer_config,
        )

        tagset = dict(TagSet=[])  # type: typing.Dict[str, typing.List[dict]]
        for tag_key, tag_value in tags.items():
            tagset['TagSet'].append(dict(Key=tag_key, Value=tag_value))
        self.s3_client.put_object_tagging(Bucket=self.bucket,
                                          Key=remote_path,
                                          Tagging=tagset)
Example #9
    def _stage_file(self, key, size=7):
        assert size < MULTIPART_THRESHOLD
        data = os.urandom(size)
        chunk_size = get_s3_multipart_chunk_size(size)
        with ChecksummingSink(write_chunk_size=chunk_size) as sink:
            sink.write(data)
            sums = sink.get_checksums()
        metadata = {
            'hca-dss-crc32c': sums['crc32c'].lower(),
            'hca-dss-s3_etag': sums['s3_etag'].lower(),
            'hca-dss-sha1': sums['sha1'].lower(),
            'hca-dss-sha256': sums['sha256'].lower(),
        }
        fh = io.BytesIO(data)
        blob = self.s3.Bucket(os.environ['DSS_S3_BUCKET_TEST']).Object(key)
        blob.upload_fileobj(fh,
                            ExtraArgs=dict(
                                Metadata=metadata,
                                ContentType="application/octet-stream"))
        return f"s3://{os.environ['DSS_S3_BUCKET_TEST']}/{key}"
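Because the checksums here are attached as S3 user metadata (rather than tags), they can be read back with a head request. A minimal sketch, assuming a boto3 client and the same bucket and key; the helper name is hypothetical:

import boto3

def read_staged_checksums(bucket: str, key: str) -> dict:
    # User metadata set via ExtraArgs=dict(Metadata=...) comes back under 'Metadata'.
    return boto3.client("s3").head_object(Bucket=bucket, Key=key)["Metadata"]

# e.g. read_staged_checksums(os.environ['DSS_S3_BUCKET_TEST'], key)['hca-dss-sha256']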
Example #10
    def _checksum_and_stage_file(self,
                                 file_handle: typing.BinaryIO,
                                 size: int,
                                 content_type: str = "application/octet-stream"):
        key = f"staging/{uuid4()}"
        chunk_size = get_s3_multipart_chunk_size(size)
        with ChecksummingSink(write_chunk_size=chunk_size) as sink:
            data = file_handle.read()
            sink.write(data)
            sums = sink.get_checksums()

        metadata = dict()
        metadata['hca-dss-crc32c'] = sums['crc32c'].lower()
        metadata['hca-dss-s3_etag'] = sums['s3_etag'].lower()
        metadata['hca-dss-sha1'] = sums['sha1'].lower()
        metadata['hca-dss-sha256'] = sums['sha256'].lower()

        with io.BytesIO(data) as fh:
            self.handle.upload_file_handle(self.staging_bucket, key, fh, content_type, metadata)

        return f"gs://{self.staging_bucket}/{key}"
Example #11
        def compute(self):
            """ Compute the checksum(s) for the given file and return a map of the value by the hash function name. """
            start_time = time.time()
            if self._data:
                with ChecksummingSink(self._data_size,
                                      hash_functions=self._checksums) as sink:
                    sink.write(self._data)
                    checksums = sink.get_checksums()
            elif self._filename:
                _multipart_chunksize = get_s3_multipart_chunk_size(
                    self._data_size)
                with ChecksummingSink(_multipart_chunksize,
                                      hash_functions=self._checksums) as sink:
                    with open(self._filename, 'rb') as _file_object:
                        while True:
                            data = _file_object.read(_multipart_chunksize)
                            if not data:
                                break
                            sink.write(data)
                    checksums = sink.get_checksums()

            logger.info("Checksumming took %.2f milliseconds to compute" %
                        ((time.time() - start_time) * 1000))
            return checksums
Example #12
def setup_copy_task(event, lambda_context):
    source_bucket = event[Key.SOURCE_BUCKET]
    source_key = event[Key.SOURCE_KEY]
    destination_bucket = event[Key.DESTINATION_BUCKET]
    destination_key = event[Key.DESTINATION_KEY]
    s3_blobstore = S3BlobStore.from_environment()
    blobinfo = s3_blobstore.get_all_metadata(source_bucket, source_key)
    # the ETag is returned with an extra set of quotes.
    source_etag = blobinfo['ETag'].strip('"')
    source_size = blobinfo['ContentLength']  # type: int
    part_size = get_s3_multipart_chunk_size(source_size)
    part_count = source_size // part_size
    if part_count * part_size < source_size:
        part_count += 1
    if part_count > 1:
        mpu = s3_blobstore.s3_client.create_multipart_upload(
            Bucket=destination_bucket,
            Key=destination_key,
            ContentType=blobinfo['ContentType'])
        event[_Key.UPLOAD_ID] = mpu['UploadId']
        event[Key.FINISHED] = False
    else:
        s3_blobstore.copy(source_bucket, source_key, destination_bucket,
                          destination_key)
        event[_Key.UPLOAD_ID] = None
        event[Key.FINISHED] = True
    event[Key.CONTENT_TYPE] = blobinfo['ContentType']
    event[_Key.SOURCE_ETAG] = source_etag
    event[_Key.SIZE] = source_size
    event[_Key.PART_SIZE] = part_size
    event[_Key.PART_COUNT] = part_count

    # clear out any previous error state
    AsyncStateItem.delete(_error_key(event))

    return event
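A quick check of the part arithmetic above, using an illustrative 64 MiB part size (the real value comes from get_s3_multipart_chunk_size): a 10 GiB source splits into exactly 160 parts and takes the multipart branch, while any source no larger than one part is copied directly.

MiB = 1024 * 1024
source_size = 10 * 1024 * MiB             # 10 GiB source object
part_size = 64 * MiB                      # illustrative part size
part_count = source_size // part_size     # 160
if part_count * part_size < source_size:  # a remainder adds one short final part
    part_count += 1
assert part_count == 160                  # > 1, so the multipart-upload branch runs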
Example #13
def upload_to_cloud(file_handles, staging_bucket, replica, from_cloud=False, log_progress=False):
    """
    Upload files to cloud.

    :param file_handles: If from_cloud is True, file_handles is an AWS S3 directory path to files with
                         appropriate metadata uploaded. Otherwise, a list of binary file_handles to upload.
    :param staging_bucket: The AWS bucket to upload the files to.
    :param replica: The cloud replica to write to. One of 'aws', 'gc', or 'azure'. Currently has no effect.
    :param bool log_progress: set to True to log progress to stdout. Progress bar will reflect bytes
                              uploaded (and not files uploaded). This is off by default,
                              as direct calls to this function are assumed to be programmatic.
                              In addition, even if this is set to True, a progress bar will not
                              be shown if (a) the logging level is not INFO or lower or (b) an
                              interactive session is not detected.
    :return: a tuple of (file_uuids, key_names, abs_file_paths) for the uploaded files
    """
    s3 = boto3.resource("s3")
    file_uuids = []
    key_names = []
    abs_file_paths = []
    log_progress = all((logger.getEffectiveLevel() <= logging.INFO, sys.stdout.isatty(), log_progress))

    if from_cloud:
        file_uuids, key_names = _copy_from_s3(file_handles[0], s3)
    else:
        destination_bucket = s3.Bucket(staging_bucket)
        if log_progress:
            total_upload_size = sum(os.fstat(f.fileno()).st_size for f in file_handles)
            logger.addHandler(ProgressBarStreamHandler())
            progress = tqdm.tqdm(total=total_upload_size, desc="Uploading to " + replica,
                                 unit="B", unit_scale=True, unit_divisor=1024)
        for raw_fh in file_handles:
            file_size = os.path.getsize(raw_fh.name)
            multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(file_size)
            tx_cfg = TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                                    multipart_chunksize=multipart_chunksize)
            with ChecksummingBufferedReader(raw_fh, multipart_chunksize) as fh:
                file_uuid = str(uuid.uuid4())
                key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
                destination_bucket.upload_fileobj(
                    fh,
                    key_name,
                    Config=tx_cfg,
                    Callback=lambda x: progress.update(x) if log_progress else None,
                    ExtraArgs={
                        'ContentType': _mime_type(fh.raw.name),
                    }
                )
                sums = fh.get_checksums()
                metadata = {
                    "hca-dss-s3_etag": sums["s3_etag"],
                    "hca-dss-sha1": sums["sha1"],
                    "hca-dss-sha256": sums["sha256"],
                    "hca-dss-crc32c": sums["crc32c"],
                }
                s3.meta.client.put_object_tagging(Bucket=destination_bucket.name,
                                                  Key=key_name,
                                                  Tagging=dict(TagSet=encode_tags(metadata)))
                file_uuids.append(file_uuid)
                key_names.append(key_name)
                abs_file_paths.append(fh.raw.name)
        if log_progress:
            logger.handlers = [l for l in logger.handlers if not isinstance(l, ProgressBarStreamHandler)]
            progress.close()
    return file_uuids, key_names, abs_file_paths
Example #14
def get_part_size(object_size, dest_replica):
    if dest_replica.storage_schema == "s3":
        return get_s3_multipart_chunk_size(object_size)
    else:
        return part_size["gs"]
    def _transfer_config(self) -> TransferConfig:
        multipart_chunksize = get_s3_multipart_chunk_size(
            self._s3obj.content_length)
        return TransferConfig(multipart_threshold=MULTIPART_THRESHOLD,
                              multipart_chunksize=multipart_chunksize)
class TestChecksummingSink(unittest.TestCase):
    file_size = os.path.getsize(TEST_FILE)
    chunk_size = s3_multipart.get_s3_multipart_chunk_size(file_size)

    def check_sums(self, checksums):
        for hash_function in checksums:
            self.assertEqual(checksums[hash_function].lower(),
                             TEST_FILE_CHECKSUMS[hash_function].lower())

    def test_crc32c_calculation(self):
        crc32 = CRC32C()

        with open(TEST_FILE, 'rb') as fh:
            data = fh.read()
            crc32.update(data)
            checksum = crc32.hexdigest()

        self.assertEqual(checksum.lower(),
                         TEST_FILE_CHECKSUMS['crc32c'].lower())

    def test_crc32_calculation_empty_data_is_zero_padded(self):
        crc32 = CRC32C()

        crc32.update(b"")
        checksum = crc32.hexdigest()

        self.assertEqual(checksum.lower(), "00000000")

    def test_checksums_after_single_write(self):
        sink = ChecksummingSink(self.chunk_size)
        with open(TEST_FILE, 'rb') as fh:
            data = fh.read()
            sink.write(data)
        sums = sink.get_checksums()
        self.check_sums(sums)

    def test_checksums_after_multiple_write(self):
        sink = ChecksummingSink(self.chunk_size)
        with open(TEST_FILE, 'rb') as fh:
            while True:
                data = fh.read(self.chunk_size)
                if not data:
                    break
                sink.write(data)
        sums = sink.get_checksums()
        self.check_sums(sums)

    def test_hash_function_list_is_configurable(self):
        checksums_to_compute = ['sha1', 's3_etag']
        sink = ChecksummingSink(self.chunk_size,
                                hash_functions=checksums_to_compute)
        with open(TEST_FILE, 'rb') as fh:
            data = fh.read()
            sink.write(data)
        sums = sink.get_checksums()
        self.assertEqual(list(sorted(sums.keys())),
                         sorted(checksums_to_compute))
        for checksum in checksums_to_compute:
            self.assertEqual(TEST_FILE_CHECKSUMS[checksum].lower(),
                             sums[checksum].lower())
Example #17
    @classmethod
    def transfer_config(cls, file_size):
        return TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                              multipart_chunksize=s3_multipart.get_s3_multipart_chunk_size(file_size))
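Such a config is normally handed straight to a managed transfer. A minimal sketch, written as a free function and assuming the same s3_multipart helpers used above are in scope; the path, bucket and key are placeholders:

import os
import boto3
from boto3.s3.transfer import TransferConfig

def make_transfer_config(file_size):
    # Same construction as the classmethod above, as a standalone helper for illustration.
    return TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                          multipart_chunksize=s3_multipart.get_s3_multipart_chunk_size(file_size))

path = "/tmp/example.bin"  # placeholder local file
boto3.client("s3").upload_file(path, "my-bucket", "uploads/example.bin",
                               Config=make_transfer_config(os.path.getsize(path)))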