def with_blob(cls, blob, read_buf: Optional[memoryview] = None):
    """
    Build an instance from a Google Storage blob.

    The blob is first opened as a BGZIP stream via `cls.with_bgzip_fileobj`.
    If that raises `bgzip.BGZIPException`, the blob is re-opened from scratch
    and handed to `cls.with_gzip_fileobj` instead.

    :param blob: The blob object passed to `gscio.Reader`.
    :param read_buf: Optional buffer forwarded to `cls.with_bgzip_fileobj`.
    :return: Whatever the selected `with_*_fileobj` constructor returns.
    """
    chunk_size = 1024 * 1024

    with ThreadPoolExecutor(max_workers=IO_CONCURRENCY) as executor:
        async_queue = AsyncQueue(executor, IO_CONCURRENCY)

        def open_reader():
            # A fresh reader is needed for every attempt: the fallback below
            # re-opens the blob rather than reusing the first reader.
            return gscio.Reader(blob, chunk_size, async_queue)

        try:
            with open_reader() as stream:
                return cls.with_bgzip_fileobj(stream, read_buf, chunk_size)
        except bgzip.BGZIPException:
            # Not BGZIP: retry with the gzip constructor.
            with open_reader() as stream:
                return cls.with_gzip_fileobj(stream)
def _get_fileobj(uri: str): if uri.startswith("gs://"): bucket_name, key = uri[5:].split("/", 1) blob = gs.get_client().bucket( bucket_name, user_project=WORKSPACE_GOOGLE_PROJECT).get_blob(key) fh = gscio.Reader(blob, chunk_size=1024**2) elif uri.startswith("drs://"): gs_client, drs_info = drs.resolve_drs_for_gs_storage(uri) bucket = gs_client.bucket(drs_info.bucket_name, user_project=WORKSPACE_GOOGLE_PROJECT) fh = gscio.Reader(bucket.get_blob(drs_info.key), chunk_size=1024**2) else: fh = open(uri, "rb") return fh
def head(drs_url: str,
         num_bytes: int = 1,
         buffer: int = MULTIPART_THRESHOLD,
         workspace_name: Optional[str] = WORKSPACE_NAME,
         workspace_namespace: Optional[str] = WORKSPACE_GOOGLE_PROJECT):
    """
    Read bytes from a DRS object and return them.

    :param drs_url: A drs:// schema URL.
    :param num_bytes: Number of bytes to read from the DRS object.
    :param buffer: Chunk size handed to the `gscio.Reader` that performs the read.
    :param workspace_name: The name of the terra workspace.
    :param workspace_namespace: The name of the terra workspace namespace; also
        used as `user_project` when obtaining the bucket.
    :return: The result of reading `num_bytes` from a newly opened reader.
    :raises AssertionError: If `drs_url` does not start with ``drs://``.
    :raises GSBlobInaccessible: If resolution or access fails with
        `DRSResolutionError`, `NotFound` or `Forbidden`.
    """
    assert drs_url.startswith("drs://"), f'Not a DRS schema: {drs_url}'
    enable_requester_pays(workspace_name, workspace_namespace)

    try:
        gs_client, drs_info = resolve_drs_for_gs_storage(drs_url)
        bucket = gs_client.bucket(drs_info.bucket_name,
                                  user_project=workspace_namespace)
        with gscio.Reader(bucket.blob(drs_info.key), chunk_size=buffer) as reader:
            the_bytes = reader.read(num_bytes)
    except (DRSResolutionError, NotFound, Forbidden):
        # Fold the original traceback into the message so callers see the cause.
        raise GSBlobInaccessible(f'The DRS URL: {drs_url}\n'
                                 f'Could not be accessed because of:\n'
                                 f'{traceback.format_exc()}')
    else:
        return the_bytes
def test_fetch_chunk(self):
    """
    `_fetch_chunk` on a reader backed by a mocked blob must raise ValueError,
    and `download_as_bytes` must have been called `reader_retries` times.
    """
    fake_blob = mock.MagicMock()
    # Slightly larger than one default chunk.
    fake_blob.size = 1.1 * default_chunk_size
    fake_blob.download_as_bytes = mock.MagicMock()
    reader = gscio.Reader(fake_blob)

    self.assertRaises(ValueError, reader._fetch_chunk, 1)

    attempts = fake_blob.download_as_bytes.call_count
    self.assertEqual(reader_retries, attempts)
def test_readinto(self):
    """
    For every test blob and every `duration_subtests` configuration, a single
    `readinto` call must fill the buffer with exactly the expected data.
    """
    for blob, expected_data in self.blob_tests:
        data_len = len(expected_data)
        # Buffer is twice the payload length, but never zero-sized.
        buff = bytearray(2 * data_len or 1)
        # Roughly a third of the payload per chunk, with a floor of one byte.
        chunk_size = data_len // 3 or 1

        for _name, _threads, async_queue, _async_set in self.duration_subtests():
            reader_kwargs = dict(chunk_size=chunk_size, async_queue=async_queue)
            with gscio.Reader(blob, **reader_kwargs) as fh:
                count = fh.readinto(buff)
                self.assertEqual(expected_data, buff[:count])
def test_read(self):
    """
    For every test blob and every `duration_subtests` configuration, check
    the reader's `number_of_chunks` and that `read()` returns the full payload.

    Non-empty payloads use a chunk size of a third of their length and expect
    4 chunks; empty payloads use a chunk size of 1 and expect 1 chunk.
    """
    for blob, expected_data in self.blob_tests:
        chunk_size, expected_number_of_chunks = (
            (len(expected_data) // 3, 4) if expected_data else (1, 1)
        )

        for _name, _threads, async_queue, _async_set in self.duration_subtests():
            reader_kwargs = dict(chunk_size=chunk_size, async_queue=async_queue)
            with gscio.Reader(blob, **reader_kwargs) as fh:
                self.assertEqual(expected_number_of_chunks, fh.number_of_chunks)
                self.assertEqual(expected_data, fh.read())
def extract_tar_gz(drs_url: str,
                   dst_pfx: str = None,
                   dst_bucket_name: str = None,
                   workspace_name: Optional[str] = WORKSPACE_NAME,
                   workspace_namespace: Optional[str] = WORKSPACE_GOOGLE_PROJECT):
    """
    Extract a `.tar.gz` archive resolved by a DRS url into a Google Storage bucket.

    :param drs_url: DRS url of the archive to extract.
    :param dst_pfx: Passed to `tar_gz.extract` as `root`.
    :param dst_bucket_name: Destination bucket name; `WORKSPACE_BUCKET` when None.
    :param workspace_name: The name of the terra workspace.
    :param workspace_namespace: The name of the terra workspace namespace; also
        used as `user_project` when obtaining the source bucket.
    """
    target_bucket_name = WORKSPACE_BUCKET if dst_bucket_name is None else dst_bucket_name

    enable_requester_pays(workspace_name, workspace_namespace)

    src_client, src_info = resolve_drs_for_gs_storage(drs_url)
    src_bucket = src_client.bucket(src_info.bucket_name,
                                   user_project=workspace_namespace)
    dst_bucket = gs.get_client().bucket(target_bucket_name)

    with ThreadPoolExecutor(max_workers=IO_CONCURRENCY) as executor:
        async_queue = async_collections.AsyncQueue(executor, IO_CONCURRENCY)
        archive_blob = src_bucket.get_blob(src_info.key)
        with gscio.Reader(archive_blob, async_queue=async_queue) as archive:
            tar_gz.extract(archive, dst_bucket, root=dst_pfx)
def test_reader_interface(self):
    """
    Check the file-object interface of `gscio.Reader`: which operations raise,
    what the capability probes report, and that `close()` flips `closed`.
    """
    fake_blob = mock.MagicMock()
    fake_blob.size = 123
    reader = gscio.Reader(fake_blob)

    # Operations expected to raise OSError.
    self.assertRaises(OSError, reader.fileno)
    self.assertRaises(OSError, reader.write, b"nonsense")
    self.assertRaises(OSError, reader.writelines, b"nonsense")
    self.assertRaises(OSError, reader.seek, 123)

    # Operations expected to raise NotImplementedError.
    self.assertRaises(NotImplementedError, reader.tell)
    self.assertRaises(NotImplementedError, reader.truncate)

    # Capability probes.
    self.assertTrue(reader.readable())
    self.assertFalse(reader.isatty())
    self.assertFalse(reader.seekable())
    self.assertFalse(reader.writable())

    # Lifecycle: open until close() is called.
    self.assertFalse(reader.closed)
    reader.close()
    self.assertTrue(reader.closed)