def drs_extract_tar_gz(args: argparse.Namespace):
    """Extract a `tar.gz` archive resolved by DRS into a Google Storage bucket.

    example: tnu drs extract-tar-gz drs://my-tar-gz gs://my-dst-bucket/root
    """
    # Fill in workspace/namespace defaults from the CLI configuration, then
    # delegate the actual extraction to the drs module.
    workspace, namespace = CLIConfig.resolve(args.workspace, args.workspace_namespace)
    args.workspace = workspace
    args.workspace_namespace = namespace
    drs.extract_tar_gz(args.drs_url, args.dst, workspace, namespace)
def drs_extract_tar_gz(args: argparse.Namespace):
    """Extract a `tar.gz` archive resolved by DRS into a Google Storage bucket.

    example: tnu drs extract-tar-gz drs://my-tar-gz gs://my-dst-bucket/root

    Raises:
        ValueError: if `args.dst_gs_url` is not a `gs://` URL.
    """
    # Validate with an explicit raise: `assert` is stripped under `python -O`,
    # which would silently let a malformed destination through.
    if not args.dst_gs_url.startswith("gs://"):
        raise ValueError(f"Destination must be a 'gs://' URL, got {args.dst_gs_url!r}")
    # Split "gs://bucket/key/prefix" into bucket and optional key prefix.
    # `partition` (unlike `split("/", 1)`) also accepts a bare "gs://bucket"
    # with no slash, yielding an empty prefix instead of raising.
    bucket, _, pfx = args.dst_gs_url[len("gs://"):].partition("/")
    pfx = pfx or None
    args.workspace, args.workspace_namespace = Config.resolve(
        args.workspace, args.workspace_namespace)
    drs.extract_tar_gz(args.drs_url, pfx, bucket, args.workspace, args.workspace_namespace)
def test_arg_propagation(self):
    # Verify that `workspace`/`workspace_namespace` arguments are propagated
    # through the drs entry points down to `enable_requester_pays`.
    # Canned DRS-resolution response: supplies a Google service account and a
    # resolved gs:// URL so the drs code can proceed without the network.
    resp_json = mock.MagicMock(
        return_value={
            'googleServiceAccount': {'data': {'project_id': "foo"}},
            'dos': {'data_object': {'urls': [{'url': 'gs://asdf/asdf'}]}}
        })
    requests_post = mock.MagicMock(
        return_value=mock.MagicMock(status_code=200, json=resp_json))
    with ExitStack() as es:
        # Stub out every collaborator with side effects (GS client, tarball
        # extraction, blob download/open, copy client, HTTP) so that only the
        # argument plumbing under test actually executes.
        es.enter_context(mock.patch("terra_notebook_utils.drs.gs.get_client"))
        es.enter_context(mock.patch("terra_notebook_utils.drs.tar_gz"))
        es.enter_context(mock.patch("terra_notebook_utils.blobstore.gs.GSBlob.download"))
        es.enter_context(mock.patch("terra_notebook_utils.drs.DRSCopyClient"))
        es.enter_context(mock.patch("terra_notebook_utils.drs.GSBlob.open"))
        es.enter_context(mock.patch("terra_notebook_utils.drs.http", post=requests_post))
        with mock.patch("terra_notebook_utils.drs.enable_requester_pays") as enable_requester_pays:
            with self.subTest("Copy to local"):
                with tempfile.NamedTemporaryFile() as tf:
                    drs.copy(self.drs_url, tf.name)
                    enable_requester_pays.assert_called_with(WORKSPACE_NAME, WORKSPACE_NAMESPACE)
            with self.subTest("Copy to bucket"):
                enable_requester_pays.reset_mock()
                drs.copy(self.drs_url, "gs://some_bucket/some_key")
                enable_requester_pays.assert_called_with(WORKSPACE_NAME, WORKSPACE_NAMESPACE)
            with self.subTest("Extract tarball"):
                enable_requester_pays.reset_mock()
                drs.extract_tar_gz(self.drs_url)
                enable_requester_pays.assert_called_with(WORKSPACE_NAME, WORKSPACE_NAMESPACE)
def test_extract_tar_gz(self): expected_data = ( b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00BC\x02\x00\x90 \xed]kO\\I\x92\xfd\xcc\xfc" b"\x8a\xd2\xb4V\xfb2\xd7\xf9~\xac\x97\x910\xb6i$\x1b\xbb\r\xdb=\xd3_\x10\x862\xae\x1d\x0c" b"\x0cU\xb8\xa7G\xfe\xf1{\xe2\xc6\xc9\xa2\xc0\xb8\xdb\xdd\xf2,_\xd2R\xc8\x87" ) # This test uses a hack property, `_extract_single_chunk`, to extract a small amount # of data from the cohort vcf pointed to by `drs://dg.4503/da8cb525-4532-4d0f-90a3-4d327817ec73`. with mock.patch("terra_notebook_utils.tar_gz._extract_single_chunk", True): drs_url = "drs://dg.4503/da8cb525-4532-4d0f-90a3-4d327817ec73" # cohort VCF tarball pfx = f"test_cohort_extract_{uuid4()}" drs.extract_tar_gz(drs_url, pfx) for key in gs.list_bucket(pfx): blob = gs.get_client().bucket(WORKSPACE_BUCKET).get_blob(key) data = blob.download_as_bytes()[:len(expected_data)] self.assertEqual(data, expected_data)
with herzog.Cell("python"):
    """Test drs copy batch"""
    # One manifest entry per (source size, destination kind) combination:
    # each DRS URI is copied both to the workspace bucket and locally.
    manifest = [
        dict(drs_uri=DRS_URI_370_KB, dst=f"gs://{bucket_name}/test-notebook-{uuid4()}"),
        dict(drs_uri=DRS_URI_370_KB, dst=f"."),
        dict(drs_uri=DRS_URI_240_MB, dst=f"gs://{bucket_name}/test-notebook-{uuid4()}"),
        dict(drs_uri=DRS_URI_240_MB, dst=f"."),
    ]
    drs.copy_batch(manifest)
with herzog.Cell("python"):
    """Test drs extract tarball"""
    # Extract both locally and into the workspace bucket.
    drs.extract_tar_gz(DRS_URI_TAR_GZ, ".")
    drs.extract_tar_gz(DRS_URI_TAR_GZ, f"gs://{bucket_name}/test-notebook-{uuid4()}")
with herzog.Cell("python"):
    """Test vcf info"""
    blob = drs.blob_for_url(DRS_URI_100_GB, os.environ['GOOGLE_PROJECT'])
    info = vcf.VCFInfo.with_blob(blob)
    # Expected sample count/first sample for this known 1000 Genomes-style VCF.
    assert 2504 == len(info.samples)
    assert "HG00096" == info.samples[0]
with herzog.Cell("python"):
    """Test workspace get"""
    workspace.get_workspace("terra-notebook-utils-tests")
with herzog.Cell("python"):
The TOPMed genomic data that you import from Gen3 is controlled access and imported into Terra as a Data Repository Service (DRS) URL to the controlled access bucket that holds the file. The code below allows you to share your credentials and download the file to your workspace so that you can interact with the file in a notebook. See which files are available in the Reference File data table """
with herzog.Cell("python"):
    # List the file names available in the reference_file data table.
    data_table = "reference_file"
    table.print_column(data_table, "pfb:file_name")
with herzog.Cell("markdown"):
    """ Select which VCF you would like to use in your analysis from the printed list above. """
with herzog.Cell("python"):
    # Get a drs url from our workspace data table (make sure to put in a file name!)
    file_name = "YOUR_FILE_NAME_.tar.gz"
    # If this next step throws a key error, make sure you are not on a Spark cluster
    # See notes in the "set up your notebook" heading above
    drs_url = table.fetch_drs_url(data_table, file_name)
    print(drs_url)
    # Copy object into our workspace bucket
    drs.copy(drs_url, file_name)
with herzog.Cell("python"):
    # Extract .tar.gz to our workspace bucket
    drs.extract_tar_gz(drs_url, file_name)
with herzog.Cell("python"):
    # Report total notebook runtime (start_notebook_time is set earlier in the notebook).
    elapsed_notebook_time = time.time() - start_notebook_time
    print(timedelta(seconds=elapsed_notebook_time))