Beispiel #1
0
def test_de_backed(sparse, file_format, tmp_path):
    """Write example data to disk, then check that one-vs-rest DE computed
    from batched, backed reads matches the in-memory reference results."""
    filesystem = fsspec.filesystem('file')
    adata = get_example_data(sparse)
    out_dir = str(tmp_path)
    PrepareData(datasets=[adata],
                output=out_dir,
                output_format=file_format).execute()
    if file_format == 'parquet':
        reader = ParquetDataset()
    elif file_format == 'zarr':
        reader = ZarrDataset()
    chunk = 30
    group_field = 'sc_groups'
    total_features = adata.shape[1]

    def fetch_batch(start):
        # Read features [start, stop) from the on-disk dataset.
        stop = min(total_features, start + chunk)
        return reader.read_dataset(filesystem=filesystem,
                                   path=out_dir,
                                   dataset=dict(id=''),
                                   keys=dict(X=[slice(start, stop)]))

    de = DE(series=adata.obs[group_field],
            nfeatures=total_features,
            batch_size=chunk,
            get_batch_fn=fetch_batch,
            base=get_base(adata),
            one_vs_rest=True)
    diff_results(adata, group_field, de.pair2results[0])
Beispiel #2
0
def test_prepare(test_data, measures, dimensions, continuous_obs, basis,
                 file_format, tmp_path):
    """Run PrepareData on a sliced copy of test_data and verify the written
    output round-trips via read_and_diff."""
    extensions = {'parquet': '.cpq', 'zarr': '.zarr'}
    output_dir = str(tmp_path / 'test.{}'.format(extensions[file_format]))
    test_data = test_data[:, measures]
    test_data.obs = test_data.obs[dimensions + continuous_obs]
    PrepareData(datasets=[test_data],
                output=output_dir,
                output_format=file_format).execute()
    if file_format == 'parquet':
        reader = ParquetDataset()
    elif file_format == 'zarr':
        reader = ZarrDataset()
    read_and_diff(reader, output_dir, test_data, measures, dimensions,
                  continuous_obs, basis)
def test_prepare(test_data, measures, dimensions, continuous_obs, basis,
                 file_format, spatial, tmp_path):
    """Run PrepareData (optionally with a spatial image entry in uns) and
    verify the written output round-trips via read_and_diff."""
    extensions = {"parquet": ".cpq", "zarr": ".zarr"}
    output_dir = str(tmp_path / "test.{}".format(extensions[file_format]))
    test_data = test_data[:, measures]
    test_data.obs = test_data.obs[dimensions + continuous_obs]
    if spatial:
        # Attach a spatial image reference so the spatial code path is exercised.
        test_data.uns["images"] = [
            dict(type="image", name="tissue_hires", image=output_dir)
        ]
    PrepareData(datasets=[test_data],
                output=output_dir,
                output_format=file_format).execute()
    if file_format == "parquet":
        reader = ParquetDataset()
    elif file_format == "zarr":
        reader = ZarrDataset()
    read_and_diff(reader, output_dir, test_data, measures, dimensions,
                  continuous_obs, basis)
Beispiel #4
0
def configure():
    """Configure API providers for serving.

    Reads connection settings from environment variables, installs an auth
    provider (GoogleAuth when CIRRO_AUTH_CLIENT_ID is set, otherwise NoAuth),
    a MongoDB database provider, and registers the parquet and anndata
    dataset readers.
    """
    from cirrocumulus.api import dataset_api
    from cirrocumulus.api import auth_api, database_api
    from cirrocumulus.no_auth import NoAuth
    auth_client_id = os.environ.get(CIRRO_AUTH_CLIENT_ID)
    db_uri = os.environ.get(CIRRO_DB_URI, DEFAULT_DB_URI)
    database = os.environ.get(CIRRO_DATABASE, DEFAULT_DATABASE)
    email = os.environ.get(CIRRO_EMAIL)
    os.environ[CIRRO_SERVE] = 'true'
    if auth_client_id is None:
        auth_api.provider = NoAuth()
    else:
        from cirrocumulus.google_auth import GoogleAuth
        auth_api.provider = GoogleAuth(auth_client_id)
    from cirrocumulus.mongo_db import MongoDb
    database_api.provider = MongoDb(db_uri, database, email)
    from cirrocumulus.parquet_dataset import ParquetDataset
    dataset_api.add(ParquetDataset())
    from cirrocumulus.anndata_dataset import AnndataDataset
    # The original expression "'r' if False else None" was a dead conditional
    # that always evaluated to None; pass None (non-backed mode) directly.
    anndata_dataset = AnndataDataset(None)
    dataset_api.add(anndata_dataset)
Beispiel #5
0
def configure(list_of_dataset_paths, spatial_directories, backed,
              marker_paths):
    """Configure local (no-auth) serving and register datasets.

    Parameters
    ----------
    list_of_dataset_paths : list of str
        Each entry is a single dataset path, or a comma-separated group of
        paths that are aligned on obs ids and concatenated along the feature
        axis into one dataset (the first path becomes the dataset id).
    spatial_directories : list of str or None
        Per-dataset spatial data directories; '' skips a dataset.
    backed : bool
        When True, anndata files are opened in read-only backed mode ('r').
    marker_paths : list or None
        Marker file paths merged into each dataset's uns['markers'].
    """
    # Hoisted from a conditional branch: these were previously imported only
    # when a dataset had missing obs ids, yet pd / scipy.sparse / anndata are
    # also used unconditionally below, causing a NameError otherwise.
    import anndata
    import pandas as pd
    import scipy.sparse
    from cirrocumulus.api import dataset_api
    from cirrocumulus.api import auth_api, database_api
    from cirrocumulus.local_db_api import LocalDbAPI
    from cirrocumulus.no_auth import NoAuth
    auth_api.provider = NoAuth()
    dataset_api.add(ParquetDataset())
    anndata_dataset = AnndataDataset('r' if backed else None)
    dataset_ids = []
    for dataset_paths in list_of_dataset_paths:
        dataset_paths = dataset_paths.split(',')
        dataset_id = os.path.normpath(dataset_paths[0])
        dataset_ids.append(dataset_id)
        if len(dataset_paths) > 1:
            to_concat = []
            all_ids = None
            for path in dataset_paths:
                print(path)
                d = anndata_dataset.get_data(path)
                # Union of obs ids across all datasets in the group.
                all_ids = d.obs.index.union(
                    all_ids) if all_ids is not None else d.obs.index
                to_concat.append(d)
            for i in range(len(to_concat)):
                d = to_concat[i]
                missing_ids = all_ids.difference(d.obs.index)
                if len(missing_ids) > 0:
                    # Pad this dataset with empty rows so every dataset
                    # covers the full id union.
                    X = None
                    if d.shape[1] > 0:
                        empty = scipy.sparse.csr_matrix(
                            (len(missing_ids), d.shape[1]))
                        X = scipy.sparse.vstack((d.X, empty), format='csr')
                    missing_df = pd.DataFrame(index=missing_ids)
                    for column in d.obs:
                        if pd.api.types.is_bool_dtype(d.obs[column]):
                            missing_df[column] = False

                    obs = pd.concat((d.obs, missing_df))
                    d = anndata.AnnData(X=X, obs=obs, var=d.var)
                d = d[all_ids]  # same order
                to_concat[i] = d
            X_list = []
            obs = None
            obsm = {}
            var = None
            for d in to_concat:
                if d.shape[1] > 0:
                    X_list.append(d.X)
                    var = pd.concat((var, d.var)) if var is not None else d.var
                obs = obs.join(d.obs) if obs is not None else d.obs
                for key in d.obsm_keys():
                    obsm[key] = d.obsm[key]

            # BUG FIX: hstack was previously called with the stale scalar X
            # instead of the collected list of matrices. Also guard against
            # the (degenerate) case of no feature matrices at all.
            if len(X_list) == 0:
                X = None
            elif len(X_list) == 1:
                X = X_list[0]
            else:
                X = scipy.sparse.hstack(X_list, format='csr')
            adata = anndata.AnnData(X=X, obs=obs, var=var, obsm=obsm)
            adata.var_names_make_unique()
            anndata_dataset.add_data(dataset_id, adata)
        dataset_api.add(anndata_dataset)

    database_api.provider = LocalDbAPI(dataset_ids)

    if spatial_directories is not None and len(spatial_directories) > 0:
        for i in range(len(spatial_directories)):
            spatial_directory = spatial_directories[i]
            if spatial_directory != '':
                adata = anndata_dataset.get_data(dataset_ids[i])
                if not add_spatial(adata, spatial_directory):
                    print('No spatial data found in {}'.format(
                        spatial_directory))

    if marker_paths is not None and len(marker_paths) > 0:
        markers = get_markers(marker_paths)
        for dataset_id in dataset_ids:
            d = anndata_dataset.get_data(dataset_id)
            existing_markers = d.uns.get('markers', [])
            markers += existing_markers
            # remove genes in dict that are not in dataset
            d.uns['markers'] = filter_markers(d, markers)
def configure_app(app, list_of_dataset_paths, spatial_directories,
                  marker_paths):
    """Configure a Flask-style app for local serving.

    Registers whichever dataset readers are installed (parquet / zarr),
    installs NoAuth and a local database API on the app config, loads the
    given datasets (concatenating comma-separated groups along the feature
    axis), and attaches spatial data and markers where provided.

    Parameters
    ----------
    app : application object whose ``config`` mapping receives CIRRO_AUTH
        and CIRRO_DATABASE providers.
    list_of_dataset_paths : list of str
        Each entry is one path or a comma-separated group of paths.
    spatial_directories : list of str or None
        Per-dataset spatial data directories; '' skips a dataset.
    marker_paths : list or None
        Marker file paths merged into each dataset's uns['markers'].
    """
    from cirrocumulus.api import dataset_api
    from cirrocumulus.no_auth import NoAuth

    # Optional readers: register only what is installed.
    try:
        from cirrocumulus.parquet_dataset import ParquetDataset

        dataset_api.add(ParquetDataset())
    except ModuleNotFoundError:
        pass
    try:
        from cirrocumulus.zarr_dataset import ZarrDataset

        dataset_api.add(ZarrDataset())
    except ModuleNotFoundError:
        pass
    app.config[CIRRO_AUTH] = NoAuth()
    os.environ[CIRRO_JOB_TYPE + "de"] = "cirrocumulus.job_api.run_de"
    anndata_dataset = AnndataDataset()
    dataset_ids = []
    for dataset_paths in list_of_dataset_paths:
        dataset_paths = dataset_paths.split(",")
        dataset_id = dataset_paths[0]
        dataset_ids.append(dataset_id)
        if len(dataset_paths) > 1:
            datasets = []
            for i in range(len(dataset_paths)):
                dataset = anndata_dataset.get_data(get_fs(dataset_paths[i]),
                                                   dataset_paths[i])
                if "group" not in dataset.var:
                    dataset.var["group"] = dataset.uns.get(
                        "name", "dataset {}".format(i + 1))
                datasets.append(dataset)
            adata = anndata.concat(datasets,
                                   axis=1,
                                   label="group",
                                   merge="unique")
            # BUG FIX: previously assigned to the loop variable `dataset`
            # (the last member dataset), which had no effect on the stored
            # object; restore obsm on the concatenated result instead.
            adata.obsm = datasets[0].obsm
            adata.var.index = adata.var.index.str.replace("/", "_")
            adata.var_names_make_unique()
            anndata_dataset.add_data(dataset_id, adata)
        dataset_api.add(anndata_dataset)

    app.config[CIRRO_DATABASE] = LocalDbAPI(dataset_ids)

    if spatial_directories is not None and len(spatial_directories) > 0:
        for i in range(len(spatial_directories)):
            spatial_directory = spatial_directories[i]
            if spatial_directory != "":
                adata = anndata_dataset.get_data(get_fs(dataset_ids[i]),
                                                 dataset_ids[i])
                if not add_spatial(adata, spatial_directory):
                    print("No spatial data found in {}".format(
                        spatial_directory))

    if marker_paths is not None and len(marker_paths) > 0:
        markers = get_markers(marker_paths)
        for dataset_id in dataset_ids:
            d = anndata_dataset.get_data(get_fs(dataset_id), dataset_id)
            existing_markers = d.uns.get("markers", [])
            markers += existing_markers
            # remove genes in dict that are not in dataset
            d.uns["markers"] = filter_markers(d, markers)