Esempio n. 1
0
 def with_blob(cls, blob, read_buf: Optional[memoryview] = None):
     """
     Build an instance from a blob, trying the BGZIP constructor first.

     The blob is wrapped in a `gscio.Reader` using a chunk size of 1024 * 1024 and an
     `AsyncQueue` driven by a thread pool of `IO_CONCURRENCY` workers. When the BGZIP
     attempt raises `bgzip.BGZIPException`, a second reader is opened over the same blob
     and passed to `cls.with_gzip_fileobj` instead.

     :param blob: The blob object handed to `gscio.Reader`.
     :param read_buf: Optional buffer forwarded to `cls.with_bgzip_fileobj`.
     :return: Whatever the constructor that succeeded returns.
     """
     one_mib = 1024 * 1024
     with ThreadPoolExecutor(max_workers=IO_CONCURRENCY) as pool:
         prefetch_queue = AsyncQueue(pool, IO_CONCURRENCY)

         def open_reader():
             # Each attempt gets its own reader over the same blob and queue.
             return gscio.Reader(blob, one_mib, prefetch_queue)

         try:
             with open_reader() as stream:
                 return cls.with_bgzip_fileobj(stream, read_buf, one_mib)
         except bgzip.BGZIPException:
             # BGZIP construction failed: retry with the gzip constructor.
             with open_reader() as stream:
                 return cls.with_gzip_fileobj(stream)
def _get_fileobj(uri: str):
    """
    Open `uri` for binary reading and return the resulting file-like object.

    Three kinds of location are recognised by prefix:

    * ``gs://<bucket>/<key>`` - the blob is looked up with the client from
      `gs.get_client()` and wrapped in a `gscio.Reader`.
    * ``drs://...`` - resolved through `drs.resolve_drs_for_gs_storage`, then wrapped
      in a `gscio.Reader`.
    * anything else - treated as a local path and opened with `open(uri, "rb")`.

    Buckets are addressed with `user_project=WORKSPACE_GOOGLE_PROJECT`.

    :param uri: Location of the object to open.
    :return: A `gscio.Reader` for remote objects, or a binary file handle for local paths.
    :raises ValueError: If a ``gs://`` URI does not contain both a bucket and a key.
    :raises FileNotFoundError: If the bucket lookup yields no blob for the requested key.
    """
    if uri.startswith("gs://"):
        bucket_name, _, key = uri[5:].partition("/")
        if not bucket_name or not key:
            # Previously surfaced as an opaque tuple-unpacking ValueError.
            raise ValueError(f"Expected a URI of the form 'gs://<bucket>/<key>', got {uri!r}")
        bucket = gs.get_client().bucket(bucket_name, user_project=WORKSPACE_GOOGLE_PROJECT)
        blob = bucket.get_blob(key)
    elif uri.startswith("drs://"):
        gs_client, drs_info = drs.resolve_drs_for_gs_storage(uri)
        bucket = gs_client.bucket(drs_info.bucket_name, user_project=WORKSPACE_GOOGLE_PROJECT)
        blob = bucket.get_blob(drs_info.key)
    else:
        return open(uri, "rb")

    if blob is None:
        # google-cloud-storage documents `get_blob` as returning None for a missing
        # object; report that explicitly rather than passing None on to `gscio.Reader`.
        raise FileNotFoundError(f"No object found for {uri!r}")
    return gscio.Reader(blob, chunk_size=1024 ** 2)
Esempio n. 3
0
def head(drs_url: str,
         num_bytes: int = 1,
         buffer: int = MULTIPART_THRESHOLD,
         workspace_name: Optional[str] = WORKSPACE_NAME,
         workspace_namespace: Optional[str] = WORKSPACE_GOOGLE_PROJECT):
    """
    Read the leading bytes of a DRS object.

    :param drs_url: A drs:// schema URL.
    :param num_bytes: Number of bytes to read from the start of the DRS object.
    :param buffer: Chunk size passed to the `gscio.Reader` that streams the blob.
    :param workspace_name: The name of the terra workspace.
    :param workspace_namespace: The name of the terra workspace namespace; also used as the
        `user_project` when addressing the bucket.
    :return: The result of reading `num_bytes` from the reader.
    :raises GSBlobInaccessible: If resolution or reading fails with `DRSResolutionError`,
        `NotFound` or `Forbidden`; the message embeds the formatted traceback.
    """
    assert drs_url.startswith("drs://"), f'Not a DRS schema: {drs_url}'
    enable_requester_pays(workspace_name, workspace_namespace)
    try:
        gs_client, drs_info = resolve_drs_for_gs_storage(drs_url)
        bucket = gs_client.bucket(drs_info.bucket_name, user_project=workspace_namespace)
        with gscio.Reader(bucket.blob(drs_info.key), chunk_size=buffer) as reader:
            return reader.read(num_bytes)
    except (DRSResolutionError, NotFound, Forbidden):
        # Fold the original traceback into the message so the root cause stays visible.
        reason = traceback.format_exc()
        raise GSBlobInaccessible(f'The DRS URL: {drs_url}\n'
                                 f'Could not be accessed because of:\n'
                                 f'{reason}')
Esempio n. 4
0
 def test_fetch_chunk(self):
     # `_fetch_chunk` is expected to raise ValueError when the download yields a mock
     # instead of real data, after calling the download `reader_retries` times.
     fake_blob = mock.MagicMock()
     # Slightly larger than one default chunk.
     fake_blob.size = 1.1 * default_chunk_size
     download = mock.MagicMock()
     fake_blob.download_as_bytes = download
     reader = gscio.Reader(fake_blob)
     with self.assertRaises(ValueError):
         reader._fetch_chunk(1)
     self.assertEqual(reader_retries, download.call_count)
Esempio n. 5
0
 def test_readinto(self):
     # For every fixture blob and every subtest configuration, `Reader.readinto` must
     # fill a caller-supplied buffer with exactly the blob's expected contents.
     for blob, expected_data in self.blob_tests:
         # Twice the payload size (never zero) so the buffer cannot be the limiting factor.
         buff_size = 2 * len(expected_data) or 1
         # About a third of the payload per chunk, but at least one byte.
         chunk_size = len(expected_data) // 3 or 1
         for _name, _threads, async_queue, _async_set in self.duration_subtests():
             # Allocate a fresh zeroed buffer per subtest: reusing one buffer would leave
             # the previous subtest's bytes in place, so a `readinto` that reported the
             # right count without writing anything could still pass.
             buff = bytearray(buff_size)
             with gscio.Reader(blob,
                               chunk_size=chunk_size,
                               async_queue=async_queue) as fh:
                 bytes_read = fh.readinto(buff)
                 self.assertEqual(expected_data, buff[:bytes_read])
Esempio n. 6
0
 def test_read(self):
     # For every fixture blob and every subtest configuration, check the reader's
     # reported chunk count and that a full `read()` returns the expected payload.
     for blob, expected_data in self.blob_tests:
         # Non-empty payloads are split into thirds and expected to need 4 chunks;
         # empty payloads use a 1-byte chunk size and expect a single chunk.
         chunk_size, expected_number_of_chunks = (
             (len(expected_data) // 3, 4) if expected_data else (1, 1)
         )
         for _name, _threads, async_queue, _async_set in self.duration_subtests():
             reader = gscio.Reader(blob, chunk_size=chunk_size, async_queue=async_queue)
             with reader as fh:
                 self.assertEqual(expected_number_of_chunks, fh.number_of_chunks)
                 self.assertEqual(expected_data, fh.read())
Esempio n. 7
0
def extract_tar_gz(drs_url: str,
                   dst_pfx: str=None,
                   dst_bucket_name: str=None,
                   workspace_name: Optional[str]=WORKSPACE_NAME,
                   workspace_namespace: Optional[str]=WORKSPACE_GOOGLE_PROJECT):
    """
    Extract a `.tar.gz` archive resolved by a DRS url into a Google Storage bucket.

    :param drs_url: DRS url of the archive, resolved with `resolve_drs_for_gs_storage`.
    :param dst_pfx: Passed to `tar_gz.extract` as `root`.
    :param dst_bucket_name: Destination bucket name; `WORKSPACE_BUCKET` is used when None.
    :param workspace_name: Forwarded to `enable_requester_pays`.
    :param workspace_namespace: Forwarded to `enable_requester_pays` and used as the
        `user_project` of the source bucket.
    """
    target_bucket_name = WORKSPACE_BUCKET if dst_bucket_name is None else dst_bucket_name
    enable_requester_pays(workspace_name, workspace_namespace)

    source_client, source_info = resolve_drs_for_gs_storage(drs_url)
    source_bucket = source_client.bucket(source_info.bucket_name,
                                         user_project=workspace_namespace)
    target_bucket = gs.get_client().bucket(target_bucket_name)

    with ThreadPoolExecutor(max_workers=IO_CONCURRENCY) as pool:
        prefetch_queue = async_collections.AsyncQueue(pool, IO_CONCURRENCY)
        archive_blob = source_bucket.get_blob(source_info.key)
        with gscio.Reader(archive_blob, async_queue=prefetch_queue) as archive:
            tar_gz.extract(archive, target_bucket, root=dst_pfx)
Esempio n. 8
0
 def test_reader_interface(self):
     # Exercise the file-object surface of `gscio.Reader`: unsupported operations must
     # raise, capability probes must report a read-only, non-seekable stream, and
     # `close()` must flip the `closed` flag.
     fake_blob = mock.MagicMock()
     fake_blob.size = 123
     reader = gscio.Reader(fake_blob)

     # (expected exception, method name, positional arguments)
     unsupported = (
         (OSError, "fileno", ()),
         (OSError, "write", (b"nonsense",)),
         (OSError, "writelines", (b"nonsense",)),
         (OSError, "seek", (123,)),
         (NotImplementedError, "tell", ()),
         (NotImplementedError, "truncate", ()),
     )
     for expected_error, method_name, args in unsupported:
         with self.assertRaises(expected_error):
             getattr(reader, method_name)(*args)

     self.assertTrue(reader.readable())
     for probe_name in ("isatty", "seekable", "writable"):
         self.assertFalse(getattr(reader, probe_name)())

     self.assertFalse(reader.closed)
     reader.close()
     self.assertTrue(reader.closed)