def read_one_chunk(chunk_index):
    import gcsfs.mapping

    gcs = gcsfs.GCSFileSystem(gcs_project, token=gcs_token)
    store = gcsfs.mapping.GCSMap(gcs_path, gcs=gcs)
    adata = ad.read_zarr(store)
    return read_zarr_chunk(adata.X, chunk_size, chunk_index)
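The read_zarr_chunk helper called above is not part of this excerpt. A minimal sketch of what such a row-chunk slicer could look like, assuming chunk_size is the Zarr chunk shape of X (the name and signature come from the call above; the body is an illustration, not the project's actual code):

def read_zarr_chunk(x, chunk_size, chunk_index):
    # Hypothetical helper: return the rows of X that belong to one chunk.
    start = chunk_index * chunk_size[0]
    end = min(start + chunk_size[0], x.shape[0])
    return x[start:end]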
Example #2
 def read_adata(self, path):
     path_lc = path.lower()
     if path_lc.endswith('.loom'):
         return anndata.read_loom(path)
     elif path_lc.endswith('.zarr'):
         return anndata.read_zarr(path)
     elif path_lc.endswith('.tsv'):
         return read_star_fusion_file(path)
     elif path_lc.endswith('.rds'):  # Seurat, convert to h5ad
         h5_file = path + '.h5ad'
         import os
         if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
             import subprocess
             import pkg_resources
             import shutil
             print('Converting Seurat object')
             if os.path.exists(h5_file):
                 os.remove(h5_file)
             subprocess.check_call(
                 ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
             shutil.copystat(path, h5_file)
         adata = anndata.read(h5_file, backed=self.backed)
         if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
             print('Using adata.raw')
             adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns)
         return adata
     return anndata.read(path, backed=self.backed)
Example #3
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith('.loom'):
        adata = anndata.read_loom(path)
    elif path.lower().endswith('.zarr'):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if 'module' in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns['module']['X'], var=adata.uns['module']['var'])
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[
            0]:
        logger.info('Using adata.raw')
        adata = anndata.AnnData(X=adata.raw.X,
                                var=adata.raw.var,
                                obs=adata.obs,
                                obsm=adata.obsm,
                                uns=adata.uns)

    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info(
                'No spatial data found in {}'.format(spatial_directory))

    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(
                adata.obs[field]):
            logger.info('Converting {} to categorical'.format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype('category')
    return adata
Example #5
 def read_adata(self, filesystem, path):
     path_lc = path.lower()
     path_lc = path_lc.rstrip('/')
     if path_lc.endswith('.loom'):
         adata = anndata.read_loom(filesystem.open(path))
     elif path_lc.endswith('.zarr'):
         adata = anndata.read_zarr(filesystem.get_mapper(path))
     elif path_lc.endswith('.tsv'):
         adata = read_star_fusion_file(filesystem.open(path))
     elif path_lc.endswith('.rds'):  # Seurat, convert to h5ad
         h5_file = path + '.h5ad'
         import os
         if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
             import subprocess
             import pkg_resources
             import shutil
             print('Converting Seurat object')
             if os.path.exists(h5_file):
                 os.remove(h5_file)
             subprocess.check_call(
                 ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
             shutil.copystat(path, h5_file)
         adata = anndata.read_h5ad(h5_file, backed='r' if self.backed else None)
         if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
             print('Using adata.raw')
             adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns)
     else:
         adata = anndata.read_h5ad(filesystem.open(path), backed='r' if self.backed else None)
     if 'module' in adata.uns:
         adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(X=adata.uns['module']['X'],
                                                           var=adata.uns['module']['var'])
     return adata
Example #6
    def adata_dist(self, sc, request):
        # regular anndata except for X, which we replace on the next line
        a = ad.read_zarr(input_file)
        input_file_X = input_file + "/X"
        if request.param == "direct":
            a.X = zappy.direct.from_zarr(input_file_X)
            yield a
        elif request.param == "executor":
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=2) as executor:
                a.X = zappy.executor.from_zarr(executor, input_file_X)
                yield a
        elif request.param == "spark":
            a.X = zappy.spark.from_zarr(sc, input_file_X)
            yield a
        elif request.param == "dask":
            a.X = da.from_zarr(input_file_X)
            yield a
        elif request.param == "pywren":
            import s3fs.mapping

            s3 = s3fs.S3FileSystem()
            input_file_X = s3fs.mapping.S3Map(
                "sc-tom-test-data/10x-10k-subset.zarr/X", s3=s3)
            executor = zappy.executor.PywrenExecutor()
            a.X = zappy.executor.from_zarr(executor, input_file_X)
            yield a
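Because adata_dist branches on request.param, the original test module presumably registers it as a parametrized pytest fixture. A rough sketch of such a declaration, with the parameter list inferred from the branches above (the decorator arguments are an assumption, not copied from the source project):

import pytest

@pytest.fixture(params=["direct", "executor", "spark", "dask", "pywren"])
def adata_dist(sc, request):
    ...  # body as in Example #6 above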
Example #7
def test_readwrite_zarr(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.raw = adata_src
    assert not is_categorical_dtype(adata_src.obs["oanno1"])
    adata_src.write_zarr(tmp_path / "test_zarr_dir", chunks=True)

    adata = ad.read_zarr(tmp_path / "test_zarr_dir")
    assert is_categorical_dtype(adata.obs["oanno1"])
    assert not is_categorical_dtype(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert adata.obs["oanno1c"].cat.categories.tolist() == ["cat1"]
    assert is_categorical_dtype(adata.raw.var["vanno2"])
    pd.testing.assert_frame_equal(adata.obs, adata_src.obs)
    pd.testing.assert_frame_equal(adata.var, adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) is type(adata_src.raw.X)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src)
Example #8
def test_backwards_compat_files(archive_dir):
    with pytest.warns(ad.OldFormatWarning):
        from_h5ad = ad.read_h5ad(archive_dir / "adata.h5ad")
    with pytest.warns(ad.OldFormatWarning):
        from_zarr = ad.read_zarr(archive_dir / "adata.zarr.zip")

    assert_equal(from_h5ad, from_zarr, exact=True)
Example #9
File: io.py Project: michalk8/wot
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None, **keywords):
    """
    Read h5ad, loom, mtx, zarr, and txt (tab- or comma-delimited) formatted files

    Parameters
    ----------
    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame
    var: {str, pd.DataFrame}
        Path to var data file or a data frame
    obs_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    var_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in var, or a list of ids
    Returns
    -------
    Annotated data matrix.
    """

    _, ext = os.path.splitext(str(path).lower())
    if ext == '.txt':
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        adata = anndata.AnnData(X=df.values, obs=pd.DataFrame(index=df.index), var=pd.DataFrame(index=df.columns))
    elif ext == '.h5ad':
        adata = anndata.read(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    elif ext == '.mtx':
        adata = anndata.read_mtx(path)
    elif ext == '.zarr':
        adata = anndata.read_zarr(path)
    else:
        raise ValueError('Unknown file format: {}'.format(ext))

    def get_df(meta):
        if not isinstance(meta, pd.DataFrame):
            tmp_path = None
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            meta = pd.read_csv(meta, sep=None, index_col='id', engine='python')
            if tmp_path is not None:
                os.remove(tmp_path)
        return meta

    if obs is not None:
        if not isinstance(obs, list) and not isinstance(obs, tuple):
            obs = [obs]
        for item in obs:
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        if not isinstance(var, list) and not isinstance(var, tuple):
            var = [var]
        for item in var:
            adata.var = adata.var.join(get_df(item))

    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
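A hypothetical call to read_dataset, assuming an expression file plus a per-cell metadata CSV indexed by an 'id' column and a boolean obs field named 'pass_qc' (the file and column names are illustrative only):

adata = read_dataset('counts.h5ad', obs='cell_metadata.csv', obs_filter='pass_qc')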
Example #10
 def test_write_zarr(self):
     log1p(self.adata_rdd)
     output_file_zarr = tmp_dir()
     self.adata_rdd.write_zarr(output_file_zarr, chunks=(2, 5))
     # read back as zarr (without using RDDs) and check it is the same as self.adata.X
     adata_log1p = ad.read_zarr(output_file_zarr)
     log1p(self.adata)
     self.assertTrue(np.array_equal(adata_log1p.X, self.adata.X))
 def from_zarr(cls, sc, zarr_file):
     """
     Read a Zarr file as an anndata object (for the metadata) and with the
     data matrix (X) as an RDD of numpy arrays.
     """
     adata = ad.read_zarr(zarr_file)
     chunk_size = zarr.open(zarr_file, mode='r')['X'].chunks
     return cls._from_anndata(sc, adata, chunk_size,
                              read_chunk_zarr(zarr_file, chunk_size))
Example #12
def test_readwrite_zarr(typ, tmp_path):
    X = typ(X_list)
    adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert pd.api.types.is_string_dtype(adata.obs['oanno1'])
    adata.write_zarr(tmp_path / 'test_zarr_dir', chunks=True)
    adata = ad.read_zarr(tmp_path / 'test_zarr_dir')
    assert pd.api.types.is_categorical(adata.obs['oanno1'])
    assert pd.api.types.is_string_dtype(adata.obs['oanno2'])
    assert adata.obs.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.obs['oanno1'].cat.categories.tolist() == ['cat1', 'cat2']
def colsum(zarr_file):
    print("Running colsum for %s" % zarr_file)
    client = Client()
    gcs = gcsfs.GCSFileSystem('hca-scale', token='cloud')
    store = gcsfs.mapping.GCSMap(zarr_file, gcs=gcs)
    adata = ad.read_zarr(store)
    adata.X = da.from_zarr(store, component='X')

    s = np.sum(adata.X, axis=0)
    s.compute()
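The same pattern (read the metadata with read_zarr, then replace X with a lazy dask array) also works against a local Zarr store; a minimal sketch, with an illustrative path:

import anndata as ad
import dask.array as da

adata = ad.read_zarr('counts.zarr')                    # obs/var/uns load eagerly
adata.X = da.from_zarr('counts.zarr', component='X')   # X stays lazy
per_gene_sum = adata.X.sum(axis=0).compute()           # evaluated only on .compute()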
Example #14
    def test_h5ad_to_zarr(self, h5ad_file, tmpdir):
        p = tmpdir.join("filtered_gene_bc_matrices.zarr")
        input = h5ad_file
        output = str(p)
        convert(input, output)

        # read back and check a few things
        adata = read_zarr(output)
        assert adata.X.shape == (2700, 32738)
        assert adata.obs.shape == (2700, 0)
        assert adata.var.shape == (32738, 1)
Example #15
def test_zarr_chunk_X(tmp_path):
    import zarr

    zarr_pth = Path(tmp_path) / "test.zarr"
    adata = gen_adata((100, 100), X_type=np.array)
    adata.write_zarr(zarr_pth, chunks=(10, 10))

    z = zarr.open(str(zarr_pth))  # As of v2.3.2 zarr won’t take a Path
    assert z["X"].chunks == (10, 10)
    from_zarr = ad.read_zarr(zarr_pth)
    assert_equal(from_zarr, adata)
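Example #15 verifies that write_zarr honors the requested chunking; reading a single chunk of X back directly through zarr, rather than through anndata, might look roughly like this (a sketch reusing the zarr_pth from the test above):

import zarr

z = zarr.open(str(zarr_pth), mode="r")
first_block = z["X"][0:10, 0:10]  # one (10, 10) chunk of the X array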
 def from_zarr_gcs(cls, sc, gcs_path, gcs_project, gcs_token='cloud'):
     """
     Read a Zarr file from GCS as an anndata object (for the metadata) and with the
     data matrix (X) as an RDD of numpy arrays.
     """
     import gcsfs.mapping
     gcs = gcsfs.GCSFileSystem(gcs_project, token=gcs_token)
     store = gcsfs.mapping.GCSMap(gcs_path, gcs=gcs)
     adata = ad.read_zarr(store)
     chunk_size = zarr.open(store, mode='r')['X'].chunks
     return cls._from_anndata(
         sc, adata, chunk_size,
         read_chunk_zarr_gcs(gcs_path, chunk_size, gcs_project, gcs_token))
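The read_chunk_zarr_gcs factory referenced above is not shown here. Judging from the read_one_chunk closure in Example #1, it plausibly builds and returns a per-chunk reader; a sketch under that assumption, not the project's verified code:

def read_chunk_zarr_gcs(gcs_path, chunk_size, gcs_project, gcs_token='cloud'):
    # Hypothetical factory: return a callable that reads one chunk of X from GCS.
    def read_one_chunk(chunk_index):
        import gcsfs.mapping
        gcs = gcsfs.GCSFileSystem(gcs_project, token=gcs_token)
        store = gcsfs.mapping.GCSMap(gcs_path, gcs=gcs)
        adata = ad.read_zarr(store)
        return read_zarr_chunk(adata.X, chunk_size, chunk_index)
    return read_one_chunk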
    def read_adata(self, filesystem, path):
        path_lc = path.lower()
        path_lc = path_lc.rstrip("/")
        if path_lc.endswith(".loom"):
            adata = anndata.read_loom(filesystem.open(path))
        elif path_lc.endswith(".zarr"):
            adata = anndata.read_zarr(filesystem.get_mapper(path))
        elif path_lc.endswith(".tsv"):
            adata = read_star_fusion_file(filesystem.open(path))
        elif path_lc.endswith(".rds"):  # Seurat, convert to h5ad
            h5_file = path + ".h5ad"
            import os

            if (not os.path.exists(h5_file)
                    or abs(os.path.getmtime(h5_file) - os.path.getmtime(path))
                    > 0.00001):
                import shutil
                import subprocess

                import pkg_resources

                print("Converting Seurat object")
                if os.path.exists(h5_file):
                    os.remove(h5_file)
                subprocess.check_call([
                    "Rscript",
                    pkg_resources.resource_filename("cirrocumulus",
                                                    "seurat2h5ad.R"),
                    path,
                    h5_file,
                ])
                shutil.copystat(path, h5_file)
            adata = anndata.read_h5ad(h5_file,
                                      backed="r" if self.backed else None)
            if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
                print("Using adata.raw")
                adata = anndata.AnnData(X=adata.raw.X,
                                        var=adata.raw.var,
                                        obs=adata.obs,
                                        obsm=adata.obsm,
                                        uns=adata.uns)
        else:
            if self.backed:
                adata = anndata.read_h5ad(path, backed="r")
            else:
                adata = anndata.read_h5ad(filesystem.open(path))
        if "module" in adata.uns:
            adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
                X=adata.uns["module"]["X"], var=adata.uns["module"]["var"])

        return adata
    def adata_dist(self, request):
        # regular anndata except for X, which we replace on the next line
        a = ad.read_zarr(input_file)
        input_file_X = input_file + "/X"
        if request.param == "direct":
            import zappy.direct

            a.X = zappy.direct.from_zarr(input_file_X)
            yield a
        elif request.param == "dask":
            import dask.array as da

            a.X = da.from_zarr(input_file_X)
            yield a
Example #19
 def test_write_zarr(self, adata, adata_dist):
     log1p(adata_dist)
     temp_store = zarr.TempStore()
     chunks = adata_dist.X.chunks
     # write metadata using regular anndata
     adata.write_zarr(temp_store, chunks)
     if isinstance(adata_dist.X, da.Array):
         adata_dist.X.to_zarr(temp_store.dir_path("X"))
     else:
         adata_dist.X.to_zarr(temp_store.dir_path("X"), chunks)
     # read back as zarr (without using RDDs) and check it is the same as adata.X
     adata_log1p = ad.read_zarr(temp_store)
     log1p(adata)
     npt.assert_allclose(adata_log1p.X, adata.X)
Example #20
def test_backwards_compat_zarr():
    import scanpy as sc
    import zarr

    pbmc_orig = sc.datasets.pbmc68k_reduced()
    # Old zarr writer couldn’t do sparse arrays
    pbmc_orig.raw._X = pbmc_orig.raw.X.toarray()
    del pbmc_orig.uns["neighbors"]

    # This was written out with anndata=0.6.22.post1
    zarrpth = HERE / "data/pbmc68k_reduced_legacy.zarr.zip"
    with zarr.ZipStore(zarrpth, mode="r") as z:
        pbmc_zarr = ad.read_zarr(z)

    assert_equal(pbmc_zarr, pbmc_orig)
Example #21
def test_readwrite_equivalent_h5ad_zarr(typ):
    tmpdir = tempfile.TemporaryDirectory()
    tmpdirpth = Path(tmpdir.name)
    h5ad_pth = tmpdirpth / "adata.h5ad"
    zarr_pth = tmpdirpth / "adata.zarr"

    M, N = 100, 101
    adata = gen_adata((M, N), X_type=typ)
    adata.raw = adata

    adata.write_h5ad(h5ad_pth)
    adata.write_zarr(zarr_pth)
    from_h5ad = ad.read_h5ad(h5ad_pth)
    from_zarr = ad.read_zarr(zarr_pth)

    assert_equal(from_h5ad, from_zarr, exact=True)
Example #22
    def test_write_zarr(self, adata, adata_dist):
        import dask.array as da
        import zarr

        log1p(adata_dist)
        temp_store = zarr.TempStore()
        chunks = adata_dist.X.chunks
        if isinstance(chunks[0], tuple):
            chunks = (chunks[0][0], ) + chunks[1]
        # write metadata using regular anndata
        adata.write_zarr(temp_store, chunks)
        if isinstance(adata_dist.X, da.Array):
            adata_dist.X.to_zarr(temp_store.dir_path("X"), overwrite=True)
        else:
            adata_dist.X.to_zarr(temp_store.dir_path("X"), chunks)
        # read back as zarr directly and check it is the same as adata.X
        adata_log1p = ad.read_zarr(temp_store)
        log1p(adata)
        npt.assert_allclose(adata_log1p.X, adata.X)
Example #23
def test_readwrite_zarr(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.raw = adata_src
    assert not is_categorical(adata_src.obs['oanno1'])
    adata_src.write_zarr(tmp_path / 'test_zarr_dir', chunks=True)

    adata = ad.read_zarr(tmp_path / 'test_zarr_dir')
    assert is_categorical(adata.obs['oanno1'])
    assert not is_categorical(adata.obs['oanno2'])
    assert adata.obs.index.tolist() == ['name1', 'name2', 'name3']
    assert adata.obs['oanno1'].cat.categories.tolist() == ['cat1', 'cat2']
    assert is_categorical(adata.raw.var['vanno2'])
    assert np.all(adata.obs == adata_src.obs)
    assert np.all(adata.var == adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) == type(adata_src.raw.X)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) == type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src)
Example #24
def test_zarr_compression(tmp_path):
    from numcodecs import Blosc

    pth = str(Path(tmp_path) / "adata.zarr")
    adata = gen_adata((10, 8))
    compressor = Blosc(cname="zstd", clevel=3, shuffle=Blosc.BITSHUFFLE)
    not_compressed = []

    ad._io.write_zarr(pth, adata, compressor=compressor)

    def check_compressed(key, value):
        if isinstance(value, zarr.Array) and value.shape != ():
            if value.compressor != compressor:
                not_compressed.append(key)

    with zarr.open(str(pth), "r") as f:
        f.visititems(check_compressed)

    if not_compressed:
        msg = "\n\t".join(not_compressed)
        raise AssertionError(
            f"These elements were not compressed correctly:\n\t{msg}")

    assert_equal(adata, ad.read_zarr(pth))
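The script excerpt below relies on module-level imports that the excerpt omits; a plausible set, matching the names it uses (assumed, not copied from the original source):

import anndata as ad
import s3fs
import s3fs.mapping
import zappy.executor
from scanpy.api.pp import recipe_zheng17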
executor = zappy.executor.PywrenExecutor(live_viewer=True,
                                         exclude_modules=None,
                                         ignore_modules=[
                                             'dash', 'dash_html_components',
                                             'dash_core_components', 'dask',
                                             'google_auth_oauthlib', 'pandas',
                                             'pytest'
                                         ])

s3 = s3fs.S3FileSystem()
if s3.exists('sc-tom-test-data/10x-log1p.zarr'):
    s3.rm('sc-tom-test-data/10x-log1p.zarr', recursive=True)
input_zarr = s3fs.mapping.S3Map(
    'sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr', s3=s3)
input_zarr_X = s3fs.mapping.S3Map(
    'sc-tom-test-data/10x/anndata_zarr_2000/10x.zarr/X', s3=s3)
intermediate_zarr = s3fs.mapping.S3Map('sc-tom-test-data/intermediate.zarr',
                                       s3=s3)
output_zarr = s3fs.mapping.S3Map('sc-tom-test-data/10x-log1p.zarr', s3=s3)

# regular anndata except for X
adata = ad.read_zarr(input_zarr)
adata.X = zappy.executor.from_zarr(executor,
                                   input_zarr_X,
                                   intermediate_store=intermediate_zarr)

recipe_zheng17(adata)

adata.X.to_zarr(output_zarr, adata.X.chunks)
Example #27
 def adata(self):
     a = ad.read_zarr(input_file)  # regular anndata
     a.X = a.X[:]  # convert to numpy array
     return a
# pip3 uninstall -y anndata
# pip3 install git+https://github.com/tomwhite/anndata@zarr
# pip3 install gcsfs
# pip3 list

import anndata as ad
import gcsfs.mapping

from scanpy.api.pp import log1p
from scanpy.api.pp import recipe_zheng17

gcs = gcsfs.GCSFileSystem('hca-scale', token='cloud')
store = gcsfs.mapping.GCSMap('ll-sc-data-bkup/10x/anndata_zarr_2000/10x.zarr', gcs=gcs)
#store = gcsfs.mapping.GCSMap('ll-sc-data-bkup/10x/10x-10k-subset.zarr', gcs=gcs)
output = gcsfs.mapping.GCSMap('ll-sc-data-bkup/10x/anndata_zarr_out/10x.zarr', gcs=gcs)
adata = ad.read_zarr(store)

import time
start = time.time()

adata.X = adata.X[:] # materialize in memory since Zarr doesn't support all the operations scanpy calls

recipe_zheng17(adata)

adata.write_zarr(output, chunks=(2000, adata.n_vars))

end = time.time()
print(end - start)

# 1080.5862543582916
# This is 18 minutes - note that it *doesn't* write back to cloud storage
 def read_one_chunk(chunk_index):
     adata = ad.read_zarr(zarr_file)
     return read_adata_chunk(adata, chunk_size, chunk_index)
Example #30
def handle_job():
    email = get_auth().auth()['email']
    database_api = get_database()
    if request.method == 'DELETE':
        content = request.get_json(force=True, cache=False)
        job_id = content.get('id', '')
        database_api.delete_job(email, job_id)
        delete_job(job_id)
        return json_response('', 204)
    elif request.method == 'POST':
        if os.environ.get('GAE_APPLICATION') is None:  # TODO
            content = request.get_json(force=True, cache=False)
            email, dataset = get_email_and_dataset(content)
            params = content.get('params')
            job_type = content.get('type')
            job_name = content.get('name')
            return dict(id=submit_job(database_api=database_api, dataset_api=dataset_api, email=email, dataset=dataset,
                                      job_name=job_name, job_type=job_type, params=params))
        else:
            raise ValueError('Submit job not supported on GAE')
    else:
        job_id = request.args['id']
        c = request.args['c']
        is_precomputed = job_id.startswith('cirro-')
        if c == 'status' or c == 'params':
            if is_precomputed:
                job = dict(status='complete') if c == 'status' else dict()
            else:
                job = database_api.get_job(email=email, job_id=job_id, return_type=c)
            if job is None:
                return json_response('', 404)  # job deleted
            return json_response(job, 200)
        if c != 'result':
            raise ValueError('c must be one of status, params, or result')
        if is_precomputed:  # precomputed result
            email = get_auth().auth()['email']
            suggested_dataset_id = request.args['ds']
            dataset = database_api.get_dataset(email, suggested_dataset_id)
            # precomputed results need to be a child of dataset
            dataset['url'] = map_url(dataset['url'])
            job_result = dataset_api.get_result(dataset, job_id)
            if get_scheme(job_result) == 'file' and not os.path.exists(job_result):
                return Response(job_result, content_type='application/json')
            else:
                return send_file(job_result)
        job = database_api.get_job(email=email, job_id=job_id, return_type=c)
        if job is None:
            return json_response('', 404)  # job deleted
        import anndata
        if isinstance(job, dict) and 'url' in job:
            url = job['url']
            content_type = job.get('content-type')
            if content_type == 'application/h5ad' or content_type == 'application/zarr':
                if content_type == 'application/h5ad':
                    with get_fs(url).open(url, mode='rb') as f:
                        adata = anndata.read(f)
                else:
                    adata = anndata.read_zarr(get_fs(url).get_mapper(url))
                adata_df = adata_to_df(adata)
                return Response(adata_df.to_json(double_precision=2, orient='records'), content_type='application/json')
            else:
                # URL to JSON or text
                return send_file(url)
        elif isinstance(job, dict):
            return json_response(job)
        elif isinstance(job, anndata.AnnData):
            return Response(adata_to_df(job).to_json(double_precision=2, orient='records'),
                            content_type='application/json')
        return job