def checksum_and_upload_file(self,
                             local_path: str,
                             remote_path: str,
                             content_type: str = "application/octet-stream",
                             metadata: typing.Dict[str, str] = None,
                             *args, **kwargs) -> None:
    if metadata is None:
        metadata = dict()
    fpath = os.path.join(self.local_root, local_path)
    size = os.path.getsize(fpath)
    chunk_size = get_s3_multipart_chunk_size(size)
    with ChecksummingSink(write_chunk_size=chunk_size) as sink, open(fpath, "rb") as fh:
        data = fh.read()
        sink.write(data)
        sums = sink.get_checksums()
    metadata['hca-dss-crc32c'] = sums['crc32c'].lower()
    metadata['hca-dss-s3_etag'] = sums['s3_etag'].lower()
    metadata['hca-dss-sha1'] = sums['sha1'].lower()
    metadata['hca-dss-sha256'] = sums['sha256'].lower()
    self.upload_file(local_path, remote_path, content_type, metadata, *args, **kwargs)  # noqa
class TestChecksummingBufferedReader(unittest.TestCase):
    file_size = os.path.getsize(TEST_FILE)
    chunk_size = s3_multipart.get_s3_multipart_chunk_size(file_size)

    def check_sums(self, checksums):
        self.assertEqual(checksums['sha1'], TEST_FILE_CHECKSUMS['sha1'])
        self.assertEqual(checksums['sha256'], TEST_FILE_CHECKSUMS['sha256'])
        self.assertEqual(checksums['crc32c'].lower(), TEST_FILE_CHECKSUMS['crc32c'].lower())
        self.assertEqual(checksums['s3_etag'], TEST_FILE_CHECKSUMS['s3_etag'])

    def test_checksums_after_single_read(self):
        with io.open(TEST_FILE, 'rb') as fh:
            reader = ChecksummingBufferedReader(fh, self.chunk_size)
            reader.read()
            sums = reader.get_checksums()
            self.check_sums(sums)

    def test_checksums_after_multiple_reads(self):
        with io.open(TEST_FILE, 'rb') as raw_fh:
            reader = ChecksummingBufferedReader(raw_fh, self.chunk_size)
            while True:
                buf = reader.read(self.chunk_size)
                if not buf:
                    break
            sums = reader.get_checksums()
            self.check_sums(sums)
def _upload_local_file_to_staging(self, path: str, file_uuid: str, content_type):
    """
    Upload a local file to the staging bucket, computing the DSS-required
    checksums in the process, then tag the file in the staging bucket with
    the checksums. This is in preparation for subsequently uploading the
    file from the staging bucket into the DSS.

    :param path: Path to a local file.
    :param file_uuid: An RFC 4122-compliant UUID used to identify the file.
    :param content_type: Content description, for example
        "application/json; dss-type=fileref".
    :return: file_uuid: str, key_name: str
    """
    def _encode_tags(tags):
        return [dict(Key=k, Value=v) for k, v in tags.items()]

    def _mime_type(filename):
        type_, encoding = mimetypes.guess_type(filename)
        if encoding:
            return encoding
        if type_:
            return type_
        return "application/octet-stream"

    file_size = os.path.getsize(path)
    multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(file_size)
    tx_cfg = TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                            multipart_chunksize=multipart_chunksize)
    s3 = boto3.resource("s3")
    destination_bucket = s3.Bucket(self.staging_bucket)
    with open(path, "rb") as file_handle, \
            ChecksummingBufferedReader(file_handle, multipart_chunksize) as fh:
        key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
        destination_bucket.upload_fileobj(
            fh,
            key_name,
            Config=tx_cfg,
            ExtraArgs={
                'ContentType': content_type if content_type is not None else _mime_type(fh.raw.name)
            })
        sums = fh.get_checksums()
        metadata = {
            "hca-dss-s3_etag": sums["s3_etag"],
            "hca-dss-sha1": sums["sha1"],
            "hca-dss-sha256": sums["sha256"],
            "hca-dss-crc32c": sums["crc32c"],
        }
        s3.meta.client.put_object_tagging(Bucket=destination_bucket.name,
                                          Key=key_name,
                                          Tagging=dict(TagSet=_encode_tags(metadata)))
    return file_uuid, key_name
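A note on the `_mime_type` helper above: `mimetypes.guess_type` returns a `(type, encoding)` pair, and the helper deliberately prefers the encoding, so a compressed file is labeled by its compression rather than its underlying type. A minimal illustration (the filename is hypothetical):

import mimetypes

# guess_type returns (type, encoding); _mime_type above checks encoding first.
print(mimetypes.guess_type("matrix.csv.gz"))  # -> ('text/csv', 'gzip')
# _mime_type("matrix.csv.gz") would therefore return 'gzip', not 'text/csv'.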
def upload_to_cloud(file_handles, staging_bucket, replica, from_cloud=False):
    """
    Upload files to the cloud.

    :param file_handles: If from_cloud is True, an AWS S3 directory path to
        files that have already been uploaded with the appropriate metadata.
        Otherwise, a list of binary file handles to upload.
    :param staging_bucket: The AWS bucket to upload the files to.
    :param replica: The cloud replica to write to. One of 'aws', 'gc', or
        'azure'. Currently unused.
    :return: a list of file UUIDs and a list of key names for the uploaded files.
    """
    s3 = boto3.resource("s3")
    file_uuids = []
    key_names = []
    if from_cloud:
        file_uuids, key_names = _copy_from_s3(file_handles[0], s3)
    else:
        destination_bucket = s3.Bucket(staging_bucket)
        for raw_fh in file_handles:
            file_size = os.path.getsize(raw_fh.name)
            multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(file_size)
            tx_cfg = TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                                    multipart_chunksize=multipart_chunksize)
            with ChecksummingBufferedReader(raw_fh, multipart_chunksize) as fh:
                file_uuid = str(uuid.uuid4())
                key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
                destination_bucket.upload_fileobj(fh,
                                                  key_name,
                                                  Config=tx_cfg,
                                                  ExtraArgs={
                                                      'ContentType': _mime_type(fh.raw.name),
                                                  })
                sums = fh.get_checksums()
                metadata = {
                    "hca-dss-s3_etag": sums["s3_etag"],
                    "hca-dss-sha1": sums["sha1"],
                    "hca-dss-sha256": sums["sha256"],
                    "hca-dss-crc32c": sums["crc32c"],
                }
                s3.meta.client.put_object_tagging(Bucket=destination_bucket.name,
                                                  Key=key_name,
                                                  Tagging=dict(TagSet=encode_tags(metadata)))
                file_uuids.append(file_uuid)
                key_names.append(key_name)
    return file_uuids, key_names
def _compute_checksums(self, progress_callback=None):
    multipart_chunksize = get_s3_multipart_chunk_size(self._s3obj.content_length)
    with ChecksummingSink(multipart_chunksize) as sink:
        self._s3client.download_fileobj(self._s3obj.bucket_name,
                                        self._s3obj.key,
                                        sink,
                                        Callback=progress_callback,
                                        Config=self._transfer_config())
        checksums = sink.get_checksums()
    if len(DssChecksums.CHECKSUM_NAMES) != len(checksums):
        error = f"checksums {checksums} for {self._s3obj.key} do not meet requirements"
        raise UploadException(status=500, title=error, detail=str(checksums))
    return checksums
def compute(self):
    """
    Compute the checksum(s) for the given file and return a map from hash
    function name to checksum value.
    """
    start_time = time.time()
    _file_size = os.path.getsize(self._filename)
    _multipart_chunksize = get_s3_multipart_chunk_size(_file_size)
    with ChecksummingSink(_multipart_chunksize, hash_functions=self._checksums) as sink:
        with open(self._filename, 'rb') as _file_object:
            data = _file_object.read(_multipart_chunksize)
            while data:
                sink.write(data)
                data = _file_object.read(_multipart_chunksize)
        checksums = sink.get_checksums()
    print("Checksumming took %.2f milliseconds to compute" % ((time.time() - start_time) * 1000))
    return checksums
def upload_file(app, contents, replica):
    src_key = generate_test_key()
    encoded = json.dumps(contents).encode()
    chunk_size = get_s3_multipart_chunk_size(len(encoded))
    with io.BytesIO(encoded) as fh, ChecksummingSink(write_chunk_size=chunk_size) as sink:
        sink.write(fh.read())
        sums = sink.get_checksums()
        metadata = {
            'hca-dss-crc32c': sums['crc32c'].lower(),
            'hca-dss-s3_etag': sums['s3_etag'].lower(),
            'hca-dss-sha1': sums['sha1'].lower(),
            'hca-dss-sha256': sums['sha256'].lower()
        }
        fh.seek(0)
        if replica == 'gcp':
            gs_test_bucket = get_env("DSS_GS_BUCKET_TEST")
            gcp_client = gs_storage.Client.from_service_account_json(
                os.getenv("GOOGLE_APPLICATION_CREDENTIALS"))
            gs_bucket = gcp_client.bucket(gs_test_bucket)
            blob = gs_bucket.blob(src_key)
            blob.upload_from_file(fh, content_type="application/json")
            blob.metadata = metadata
            blob.patch()
            source_url = f"gs://{gs_test_bucket}/{src_key}"
        if replica == 'aws':
            # TODO: consider switching to unmanaged uploader (putobject w/blob)
            s3_test_bucket = get_env("DSS_S3_BUCKET_TEST")
            s3 = boto3.resource('s3')
            s3.Bucket(s3_test_bucket).Object(src_key).upload_fileobj(
                fh, ExtraArgs={"Metadata": metadata})
            source_url = f"s3://{s3_test_bucket}/{src_key}"
    file_uuid = str(uuid4())
    version = datetime_to_version_format(datetime.utcnow())
    urlbuilder = UrlBuilder().set(path='/v1/files/' + file_uuid)
    urlbuilder.add_query("version", version)
    resp_obj = app.put(str(urlbuilder),
                       json=dict(creator_uid=0, source_url=source_url),
                       headers=get_auth_header())
    resp_obj.raise_for_status()
    return file_uuid, resp_obj.json()["version"]
def upload_file(self,
                local_path: str,
                remote_path: str,
                content_type: str = "application/octet-stream",
                metadata_keys: typing.Dict[str, str] = None,
                tags: typing.Dict[str, str] = None,
                s3_part_size: int = None,
                *args, **kwargs) -> None:
    if metadata_keys is None:
        metadata_keys = dict()
    if tags is None:
        tags = dict()
    fp = os.path.join(self.local_root, local_path)
    sz = os.stat(fp).st_size
    if s3_part_size is None:
        s3_part_size = get_s3_multipart_chunk_size(sz)
    transfer_config = TransferConfig(
        multipart_threshold=MULTIPART_THRESHOLD,
        multipart_chunksize=s3_part_size,
    )
    logger.info(f"Uploading {local_path} to s3://{self.bucket}/{remote_path}")
    self.s3_client.upload_file(
        fp,
        self.bucket,
        remote_path,
        ExtraArgs={
            "Metadata": metadata_keys,
            "ContentType": content_type,
        },
        Config=transfer_config,
    )
    tagset = dict(TagSet=[])  # type: typing.Dict[str, typing.List[dict]]
    for tag_key, tag_value in tags.items():
        tagset['TagSet'].append(dict(Key=tag_key, Value=tag_value))
    self.s3_client.put_object_tagging(Bucket=self.bucket,
                                      Key=remote_path,
                                      Tagging=tagset)
def _stage_file(self, key, size=7):
    assert size < MULTIPART_THRESHOLD
    data = os.urandom(size)
    chunk_size = get_s3_multipart_chunk_size(size)
    with ChecksummingSink(write_chunk_size=chunk_size) as sink:
        sink.write(data)
        sums = sink.get_checksums()
    metadata = {
        'hca-dss-crc32c': sums['crc32c'].lower(),
        'hca-dss-s3_etag': sums['s3_etag'].lower(),
        'hca-dss-sha1': sums['sha1'].lower(),
        'hca-dss-sha256': sums['sha256'].lower(),
    }
    fh = io.BytesIO(data)
    blob = self.s3.Bucket(os.environ['DSS_S3_BUCKET_TEST']).Object(key)
    blob.upload_fileobj(fh, ExtraArgs=dict(Metadata=metadata,
                                           ContentType="application/octet-stream"))
    return f"s3://{os.environ['DSS_S3_BUCKET_TEST']}/{key}"
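The `assert size < MULTIPART_THRESHOLD` above keeps the staged object below the multipart threshold, so it is uploaded in a single part and S3 sets its ETag to the hex MD5 of the body. A minimal sketch of that convention (this assumes the checksumming library's one-chunk 's3_etag' mirrors AWS's single-part ETag, which is the usual behavior):

import hashlib
import os

data = os.urandom(7)  # same default size as _stage_file above

# For a single-part upload, S3's ETag is the hex MD5 of the object body;
# a one-chunk s3_etag is expected to match this value.
single_part_etag = hashlib.md5(data).hexdigest()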
def _checksum_and_stage_file(self,
                             file_handle: typing.BinaryIO,
                             size: int,
                             content_type: str = "application/octet-stream"):
    key = f"staging/{uuid4()}"
    chunk_size = get_s3_multipart_chunk_size(size)
    with ChecksummingSink(write_chunk_size=chunk_size) as sink:
        data = file_handle.read()
        sink.write(data)
        sums = sink.get_checksums()
    metadata = dict()
    metadata['hca-dss-crc32c'] = sums['crc32c'].lower()
    metadata['hca-dss-s3_etag'] = sums['s3_etag'].lower()
    metadata['hca-dss-sha1'] = sums['sha1'].lower()
    metadata['hca-dss-sha256'] = sums['sha256'].lower()
    with io.BytesIO(data) as fh:
        self.handle.upload_file_handle(self.staging_bucket, key, fh, content_type, metadata)
    return f"gs://{self.staging_bucket}/{key}"
def compute(self):
    """
    Compute the checksum(s) for the given file and return a map from hash
    function name to checksum value.
    """
    start_time = time.time()
    if self._data:
        with ChecksummingSink(self._data_size, hash_functions=self._checksums) as sink:
            sink.write(self._data)
            checksums = sink.get_checksums()
    elif self._filename:
        _multipart_chunksize = get_s3_multipart_chunk_size(self._data_size)
        with ChecksummingSink(_multipart_chunksize, hash_functions=self._checksums) as sink:
            with open(self._filename, 'rb') as _file_object:
                while True:
                    data = _file_object.read(_multipart_chunksize)
                    if not data:
                        break
                    sink.write(data)
            checksums = sink.get_checksums()
    logger.info("Checksumming took %.2f milliseconds to compute" % ((time.time() - start_time) * 1000))
    return checksums
def setup_copy_task(event, lambda_context):
    source_bucket = event[Key.SOURCE_BUCKET]
    source_key = event[Key.SOURCE_KEY]
    destination_bucket = event[Key.DESTINATION_BUCKET]
    destination_key = event[Key.DESTINATION_KEY]
    s3_blobstore = S3BlobStore.from_environment()
    blobinfo = s3_blobstore.get_all_metadata(source_bucket, source_key)
    source_etag = blobinfo['ETag'].strip("\"")  # the ETag is returned with an extra set of quotes
    source_size = blobinfo['ContentLength']  # type: int
    part_size = get_s3_multipart_chunk_size(source_size)
    part_count = source_size // part_size
    if part_count * part_size < source_size:
        part_count += 1
    if part_count > 1:
        mpu = s3_blobstore.s3_client.create_multipart_upload(
            Bucket=destination_bucket,
            Key=destination_key,
            ContentType=blobinfo['ContentType'])
        event[_Key.UPLOAD_ID] = mpu['UploadId']
        event[Key.FINISHED] = False
    else:
        s3_blobstore.copy(source_bucket, source_key, destination_bucket, destination_key)
        event[_Key.UPLOAD_ID] = None
        event[Key.FINISHED] = True
    event[Key.CONTENT_TYPE] = blobinfo['ContentType']
    event[_Key.SOURCE_ETAG] = source_etag
    event[_Key.SIZE] = source_size
    event[_Key.PART_SIZE] = part_size
    event[_Key.PART_COUNT] = part_count

    # clear out any previous error state
    AsyncStateItem.delete(_error_key(event))
    return event
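The `part_count` arithmetic in `setup_copy_task` is integer ceiling division written out by hand. A standalone sketch with hypothetical sizes, checked against `math.ceil`:

import math

# Hypothetical sizes: one byte past 5 GiB, 64 MiB parts.
source_size = 5 * 1024 ** 3 + 1
part_size = 64 * 1024 ** 2

# Same two-step computation as setup_copy_task above.
part_count = source_size // part_size
if part_count * part_size < source_size:
    part_count += 1

assert part_count == math.ceil(source_size / part_size) == 81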
def upload_to_cloud(file_handles, staging_bucket, replica, from_cloud=False, log_progress=False):
    """
    Upload files to the cloud.

    :param file_handles: If from_cloud is True, an AWS S3 directory path to
        files that have already been uploaded with the appropriate metadata.
        Otherwise, a list of binary file handles to upload.
    :param staging_bucket: The AWS bucket to upload the files to.
    :param replica: The cloud replica to write to. One of 'aws', 'gc', or
        'azure'. Currently unused.
    :param bool log_progress: Set to True to log progress to stdout. The
        progress bar reflects bytes uploaded, not files uploaded. This is off
        by default, as direct calls to this function are assumed to be
        programmatic. Even if set to True, a progress bar will not be shown
        if (a) the logging level is not INFO or lower or (b) an interactive
        session is not detected.
    :return: a list of file UUIDs, key names, and absolute local file paths
        for the uploaded files
    """
    s3 = boto3.resource("s3")
    file_uuids = []
    key_names = []
    abs_file_paths = []
    log_progress = all((logger.getEffectiveLevel() <= logging.INFO,
                        sys.stdout.isatty(),
                        log_progress))
    if from_cloud:
        file_uuids, key_names = _copy_from_s3(file_handles[0], s3)
    else:
        destination_bucket = s3.Bucket(staging_bucket)
        if log_progress:
            total_upload_size = sum(os.fstat(f.fileno()).st_size for f in file_handles)
            logger.addHandler(ProgressBarStreamHandler())
            progress = tqdm.tqdm(total=total_upload_size,
                                 desc="Uploading to " + replica,
                                 unit="B",
                                 unit_scale=True,
                                 unit_divisor=1024)
        for raw_fh in file_handles:
            file_size = os.path.getsize(raw_fh.name)
            multipart_chunksize = s3_multipart.get_s3_multipart_chunk_size(file_size)
            tx_cfg = TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                                    multipart_chunksize=multipart_chunksize)
            with ChecksummingBufferedReader(raw_fh, multipart_chunksize) as fh:
                file_uuid = str(uuid.uuid4())
                key_name = "{}/{}".format(file_uuid, os.path.basename(fh.raw.name))
                destination_bucket.upload_fileobj(
                    fh,
                    key_name,
                    Config=tx_cfg,
                    Callback=lambda x: progress.update(x) if log_progress else None,
                    ExtraArgs={
                        'ContentType': _mime_type(fh.raw.name),
                    }
                )
                sums = fh.get_checksums()
                metadata = {
                    "hca-dss-s3_etag": sums["s3_etag"],
                    "hca-dss-sha1": sums["sha1"],
                    "hca-dss-sha256": sums["sha256"],
                    "hca-dss-crc32c": sums["crc32c"],
                }
                s3.meta.client.put_object_tagging(Bucket=destination_bucket.name,
                                                  Key=key_name,
                                                  Tagging=dict(TagSet=encode_tags(metadata)))
                file_uuids.append(file_uuid)
                key_names.append(key_name)
                abs_file_paths.append(fh.raw.name)
        if log_progress:
            logger.handlers = [h for h in logger.handlers
                               if not isinstance(h, ProgressBarStreamHandler)]
            progress.close()
    return file_uuids, key_names, abs_file_paths
def get_part_size(object_size, dest_replica):
    if dest_replica.storage_schema == "s3":
        return get_s3_multipart_chunk_size(object_size)
    else:
        return part_size["gs"]
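S3 allows at most 10,000 parts per multipart upload, which is why the S3 branch above derives the part size from the object size instead of using a fixed constant. A hedged sanity check of that property (the 1 TiB size is arbitrary, and this assumes `get_s3_multipart_chunk_size` is designed around the AWS limit, as its use here suggests):

# S3's hard limit on parts per multipart upload.
MAX_S3_PARTS = 10000

object_size = 1024 ** 4  # 1 TiB, an arbitrary test size
chunk = get_s3_multipart_chunk_size(object_size)

# Ceiling division: the resulting part count must stay within the limit.
assert -(-object_size // chunk) <= MAX_S3_PARTS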
def _transfer_config(self) -> TransferConfig:
    multipart_chunksize = get_s3_multipart_chunk_size(self._s3obj.content_length)
    return TransferConfig(multipart_threshold=MULTIPART_THRESHOLD,
                          multipart_chunksize=multipart_chunksize)
class TestChecksummingSink(unittest.TestCase):
    file_size = os.path.getsize(TEST_FILE)
    chunk_size = s3_multipart.get_s3_multipart_chunk_size(file_size)

    def check_sums(self, checksums):
        for hash_function in checksums.keys():
            self.assertEqual(checksums[hash_function].lower(),
                             TEST_FILE_CHECKSUMS[hash_function].lower())

    def test_crc32c_calculation(self):
        crc32 = CRC32C()
        with open(TEST_FILE, 'rb') as fh:
            data = fh.read()
        crc32.update(data)
        checksum = crc32.hexdigest()
        self.assertEqual(checksum.lower(), TEST_FILE_CHECKSUMS['crc32c'].lower())

    def test_crc32_calculation_empty_data_is_zero_padded(self):
        crc32 = CRC32C()
        crc32.update(b"")
        checksum = crc32.hexdigest()
        self.assertEqual(checksum.lower(), "00000000")

    def test_checksums_after_single_write(self):
        sink = ChecksummingSink(self.chunk_size)
        with open(TEST_FILE, 'rb') as fh:
            data = fh.read()
        sink.write(data)
        sums = sink.get_checksums()
        self.check_sums(sums)

    def test_checksums_after_multiple_write(self):
        sink = ChecksummingSink(self.chunk_size)
        with open(TEST_FILE, 'rb') as fh:
            while True:
                data = fh.read(self.chunk_size)
                if not data:
                    break
                sink.write(data)
        sums = sink.get_checksums()
        self.check_sums(sums)

    def test_hash_function_list_is_configurable(self):
        checksums_to_compute = ['sha1', 's3_etag']
        sink = ChecksummingSink(self.chunk_size, hash_functions=checksums_to_compute)
        with open(TEST_FILE, 'rb') as fh:
            data = fh.read()
        sink.write(data)
        sums = sink.get_checksums()
        self.assertEqual(sorted(sums.keys()), sorted(checksums_to_compute))
        for checksum in checksums_to_compute:
            self.assertEqual(TEST_FILE_CHECKSUMS[checksum].lower(),
                             sums[checksum].lower())
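The zero-padding test above guards against a `hexdigest()` that drops leading zeros: the CRC of empty input is 0, which naive `'%x'` formatting would render as `'0'`. The same concern illustrated with the stdlib CRC-32 (a different polynomial than CRC-32C, but an identical formatting pitfall):

import zlib

# zlib.crc32(b"") is 0; an explicit width keeps the digest 8 hex digits.
checksum = format(zlib.crc32(b"") & 0xFFFFFFFF, "08x")
assert checksum == "00000000"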
def transfer_config(cls, file_size):
    return TransferConfig(multipart_threshold=s3_multipart.MULTIPART_THRESHOLD,
                          multipart_chunksize=s3_multipart.get_s3_multipart_chunk_size(file_size))
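For context, a `TransferConfig` built this way is normally handed straight to a boto3 managed transfer. A hedged usage sketch (the class name `Uploader`, the local path, and the bucket/key are placeholders):

import os

import boto3

local_path = "/tmp/example.bin"  # placeholder path
file_size = os.path.getsize(local_path)

# Uploader stands in for whatever class defines transfer_config above.
boto3.client("s3").upload_file(local_path, "example-bucket", "example-key",
                               Config=Uploader.transfer_config(file_size))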