Example #1
def test_readloom_deprecations(tmp_path):
    loom_pth = tmp_path / "test.loom"
    adata_src = gen_adata((5, 10),
                          obsm_types=[np.ndarray],
                          varm_types=[np.ndarray])
    adata_src.write_loom(loom_pth, write_obsm_varm=True)

    # obsm_names -> obsm_mapping
    obsm_mapping = {"df": adata_src.obs.columns}
    with pytest.warns(FutureWarning):
        depr_result = ad.read_loom(loom_pth, obsm_names=obsm_mapping)
    actual_result = ad.read_loom(loom_pth, obsm_mapping=obsm_mapping)
    assert_equal(actual_result, depr_result)
    with pytest.raises(ValueError, match="ambiguous"):
        ad.read_loom(loom_pth,
                     obsm_mapping=obsm_mapping,
                     obsm_names=obsm_mapping)

    # varm_names -> varm_mapping
    varm_mapping = {"df": adata_src.var.columns}
    with pytest.warns(FutureWarning):
        depr_result = ad.read_loom(loom_pth, varm_names=varm_mapping)
    actual_result = ad.read_loom(loom_pth, varm_mapping=varm_mapping)
    assert_equal(actual_result, depr_result)
    with pytest.raises(ValueError, match="ambiguous"):
        ad.read_loom(loom_pth,
                     varm_mapping=varm_mapping,
                     varm_names=varm_mapping)

    # positional -> keyword
    with pytest.warns(FutureWarning, match="sparse"):
        depr_result = ad.read_loom(loom_pth, True)
    actual_result = ad.read_loom(loom_pth, sparse=True)
    assert type(depr_result.X) is type(actual_result.X)
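The same migration applies to user code: the deprecated obsm_names/varm_names keywords still work but emit a FutureWarning, so callers can switch to obsm_mapping/varm_mapping directly. A minimal migration sketch; the file name and column names here are hypothetical:

import anndata as ad

# Deprecated spelling: still accepted, but emits a FutureWarning.
adata_old = ad.read_loom("my_data.loom", obsm_names={"df": ["col1", "col2"]})

# Current spelling: same result, no warning.
adata_new = ad.read_loom("my_data.loom", obsm_mapping={"df": ["col1", "col2"]})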
Example #2
def loom_to_csv(args) -> None:
    u"""
    Convert velocyte loom file to csv
    :param loom_path:
    :param output:
    :return:
    """
    loom_path, output = args
    if not os.path.exists(output):
        os.makedirs(output)

    logger.info("Loading from {0}".format(loom_path))
    data = anndata.read_loom(os.path.abspath(loom_path))

    logger.info("Feature of {0}".format(os.path.basename(loom_path)))
    data.var.to_csv(os.path.join(output, "var.csv.gz"))

    logger.info("Barcode of {0}".format(os.path.basename(loom_path)))
    data.obs.to_csv(os.path.join(output, "obs.csv.gz"))

    for i in ["matrix", "ambiguous", "spliced", "unspliced"]:
        logger.info("{1} of {0}".format(os.path.basename(loom_path), i))

        temp = pd.DataFrame(data.layers[i].todense())
        temp.columns = data.var.index
        temp["index"] = data.obs.index
        temp = temp.melt(id_vars="index")
        temp = temp.loc[temp["value"] > 0, :]
        temp.to_csv(os.path.join(output, "{0}.csv.gz".format(i)))
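Because loom_to_csv takes its inputs as a single (loom_path, output) tuple, it drops straight into multiprocessing.Pool. A minimal driver sketch; the loom paths and output directories are hypothetical:

from multiprocessing import Pool

# Hypothetical (loom_path, output) pairs, one output directory per loom file.
jobs = [
    ("sample1.loom", "csv/sample1"),
    ("sample2.loom", "csv/sample2"),
]

with Pool(processes=2) as pool:
    # each tuple is passed to loom_to_csv as its single `args` argument
    pool.map(loom_to_csv, jobs)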
Example #3
def test_readwrite_loom(typ, obsm_names, varm_names, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.obsm["X_a"] = np.zeros((adata_src.n_obs, 2))
    adata_src.varm["X_b"] = np.zeros((adata_src.n_vars, 3))
    adata_src.write_loom(tmp_path / "test.loom", write_obsm_varm=True)

    adata = ad.read_loom(
        tmp_path / "test.loom",
        sparse=typ is csr_matrix,
        obsm_names=obsm_names,
        varm_names=varm_names,
        cleanup=True,
    )
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
    assert "X_a" in adata.obsm_keys() and adata.obsm["X_a"].shape[1] == 2
    assert "X_b" in adata.varm_keys() and adata.varm["X_b"].shape[1] == 3
    # as we called with `cleanup=True`
    assert "oanno1b" in adata.uns["loom-obs"]
    assert "vanno2" in adata.uns["loom-var"]
    for k, v in obsm_names.items():
        assert k in adata.obsm_keys() and adata.obsm[k].shape[1] == len(v)
    for k, v in varm_names.items():
        assert k in adata.varm_keys() and adata.varm[k].shape[1] == len(v)
Example #4
def run(args):
    """Compile an AnnData object from a loom file
    
    """
    # Parse options...
    options = args
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    # take options
    h5_fname = options.ifile
    out_fname = options.ofile

    # read
    adata = ad.read_loom(h5_fname, sparse=True)

    # add n_counts and n_kmers to adata.obs
    adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
    adata.obs['n_kmers'] = np.sum(adata.X > 0, axis=1).A1

    # add n_cells to adata.var
    adata.var['n_cells'] = np.sum(adata.X > 0, axis=0).A1

    # save adata to h5ad
    adata.write_h5ad(filename=out_fname)
    return
Example #5
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith(".loom"):
        adata = anndata.read_loom(path)
    elif path.lower().endswith(".zarr"):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if "module" in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns["module"]["X"], var=adata.uns["module"]["var"]
        )
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
        logger.info("Using adata.raw")
        adata = anndata.AnnData(
            X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns
        )

    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info("No spatial data found in {}".format(spatial_directory))

    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
            logger.info("Converting {} to categorical".format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype("category")
    return adata
Example #6
 def read_adata(self, path):
     path_lc = path.lower()
     if path_lc.endswith('.loom'):
         return anndata.read_loom(path)
     elif path_lc.endswith('.zarr'):
         return anndata.read_zarr(path)
     elif path_lc.endswith('.tsv'):
         return read_star_fusion_file(path)
     elif path_lc.endswith('.rds'):  # Seurat, convert to h5ad
         h5_file = path + '.h5ad'
         import os
         if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
             import subprocess
             import pkg_resources
             import shutil
             print('Converting Seurat object')
             if os.path.exists(h5_file):
                 os.remove(h5_file)
             subprocess.check_call(
                 ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
             shutil.copystat(path, h5_file)
         adata = anndata.read(h5_file, backed=self.backed)
         if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
             print('Using adata.raw')
             adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns)
         return adata
     return anndata.read(path, backed=self.backed)
Example #7
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith('.loom'):
        adata = anndata.read_loom(path)
    elif path.lower().endswith('.zarr'):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if 'module' in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns['module']['X'], var=adata.uns['module']['var'])
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[
            0]:
        logger.info('Using adata.raw')
        adata = anndata.AnnData(X=adata.raw.X,
                                var=adata.raw.var,
                                obs=adata.obs,
                                obsm=adata.obsm,
                                uns=adata.uns)

    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info(
                'No spatial data found in {}'.format(spatial_directory))

    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(
                adata.obs[field]):
            logger.info('Converting {} to categorical'.format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype('category')
    return adata
Example #8
def importLoom(inFname):
    " load a loom file with anndata and fix up the obsm attributes "
    import pandas as pd
    import anndata
    ad = anndata.read_loom(inFname)

    coordKeyList = (["_tSNE1", "_tSNE2"], ["_X", "_Y"], ["UMAP1", "UMAP2"],
                    ['Main_cluster_umap_1', 'Main_cluster_umap_2'])
    obsKeys = getObsKeys(ad)
    foundCoords = False
    for coordKeys in coordKeyList:
        if coordKeys[0] in obsKeys and coordKeys[1] in obsKeys:
            logging.debug(
                "Found %s in anndata.obs, moving these fields into obsm" %
                repr(coordKeys))
            newObj = pd.concat([ad.obs[coordKeys[0]], ad.obs[coordKeys[1]]],
                               axis=1)
            ad.obsm["tsne"] = newObj
            del ad.obs[coordKeys[0]]
            del ad.obs[coordKeys[1]]
            foundCoords = True
            break

    if not foundCoords:
        logging.warning(
            "Did not find any keys like %s in anndata.obs, cannot import coordinates"
            % repr(coordKeyList))
    return ad
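The snippet depends on a getObsKeys helper that is not shown. A minimal stand-in, assuming it simply lists the obs column names:

def getObsKeys(ad):
    # Stand-in for the helper used above: names of the per-cell obs columns.
    return list(ad.obs.columns)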
Example #9
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        elif filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        elif filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        elif filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        elif filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[
                -4:] == 'data':
            adata = anndata.read_text(filepath)
        elif filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        elif filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
        else:
            # an unknown extension would otherwise leave `adata` unbound
            raise IncorrectFileFormat(
                "File does not exist or file format is incorrect.")
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
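The chain of string slices above can be collapsed into a table-driven dispatch keyed on the real file extension. A sketch reusing the same readers and the IncorrectFileFormat exception from the example; anndata.read_hdf is omitted because it also requires a dataset key:

import os
import anndata

# Extension -> reader function.
READERS = {
    ".h5ad": anndata.read_h5ad,
    ".csv": anndata.read_csv,
    ".xlsx": anndata.read_excel,
    ".mtx": anndata.read_mtx,
    ".txt": anndata.read_text,
    ".tab": anndata.read_text,
    ".data": anndata.read_text,
    ".loom": anndata.read_loom,
}

def load_by_extension(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    try:
        return READERS[ext](filepath)
    except KeyError:
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")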
Example #10
 def read_adata(self, filesystem, path):
     path_lc = path.lower()
     path_lc = path_lc.rstrip('/')
     if path_lc.endswith('.loom'):
         adata = anndata.read_loom(filesystem.open(path))
     elif path_lc.endswith('.zarr'):
         adata = anndata.read_zarr(filesystem.get_mapper(path))
     elif path_lc.endswith('.tsv'):
         adata = read_star_fusion_file(filesystem.open(path))
     elif path_lc.endswith('.rds'):  # Seurat, convert to h5ad
         h5_file = path + '.h5ad'
         import os
         if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
             import subprocess
             import pkg_resources
             import shutil
             print('Converting Seurat object')
             if os.path.exists(h5_file):
                 os.remove(h5_file)
             subprocess.check_call(
                 ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
             shutil.copystat(path, h5_file)
         adata = anndata.read_h5ad(h5_file, backed='r' if self.backed else None)
         if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
             print('Using adata.raw')
             adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns)
     else:
         adata = anndata.read_h5ad(filesystem.open(path), backed='r' if self.backed else None)
     if 'module' in adata.uns:
         adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(X=adata.uns['module']['X'],
                                                           var=adata.uns['module']['var'])
     return adata
Example #11
def get_adata(url, filename=None):
    """Download example data to local folder.

    Parameters
    ----------
        url: str
            URL to download the data from.
        filename: str, optional
            Name for the local copy; derived from the URL if not given.

    Returns
    -------
        adata: :class:`~anndata.AnnData`
            an AnnData object.
    """

    filename = ntpath.basename(url) if filename is None else filename

    filename = "./data/" + filename
    if not os.path.exists(filename):
        if not os.path.exists("./data/"):
            os.mkdir("data")

        urlretrieve(url, filename)  # download the data

    if Path(filename).suffixes[-1][1:] == "loom":
        adata = read_loom(filename=filename)
    elif Path(filename).suffixes[-1][1:] == "h5ad":
        adata = read_h5ad(filename=filename)

    adata.var_names_make_unique()

    return adata
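A usage sketch for get_adata; the URL is hypothetical, and the file is cached under ./data/ so repeated calls skip the download:

# Hypothetical URL pointing at a loom file.
adata = get_adata("https://example.com/datasets/sample.loom")

# Optionally pin the local file name.
adata = get_adata("https://example.com/datasets/sample.loom", filename="sample.loom")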
Example #12
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None, **keywords):
    """
    Read h5ad, loom, mtx, 10X h5, and csv formatted files

    Parameters
    ----------
    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame
    var: {str, pd.DataFrame}
        Path to var data file or a data frame
    obs_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    var_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in var, or a list of ids

    Returns
    -------
    Annotated data matrix.
    """

    _, ext = os.path.splitext(str(path).lower())
    if ext == '.txt':
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        adata = anndata.AnnData(X=df.values, obs=pd.DataFrame(index=df.index), var=pd.DataFrame(index=df.columns))
    elif ext == '.h5ad':
        adata = anndata.read(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    elif ext == '.mtx':
        adata = anndata.read_mtx(path)
    elif ext == '.zarr':
        adata = anndata.read_zarr(path)
    else:
        raise ValueError('Unknown file format: {}'.format(ext))

    def get_df(meta):
        if not isinstance(meta, pd.DataFrame):
            tmp_path = None
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            meta = pd.read_csv(meta, sep=None, index_col='id', engine='python')
            if tmp_path is not None:
                os.remove(tmp_path)
        return meta

    if obs is not None:
        if not isinstance(obs, list) and not isinstance(obs, tuple):
            obs = [obs]
        for item in obs:
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        if not isinstance(var, list) and not isinstance(var, tuple):
            var = [var]
        for item in var:
            adata.var = adata.var.join(get_df(item))

    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
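A typical call joins extra per-cell metadata onto obs and filters cells in one step. A usage sketch with hypothetical file names; per get_df above, the metadata csv needs an `id` index column:

adata = read_dataset(
    "counts.loom",
    obs="cell_metadata.csv",      # joined onto adata.obs by id
    obs_filter="keep_cells.txt",  # one cell id per line
)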
Example #13
def test_readwrite_loom(tmp_path):
    loom_path = tmp_path / "test.loom"
    adata = AnnData(X=X, layers=dict(L=L.copy()))
    adata.write_loom(loom_path)
    adata_read = read_loom(loom_path, X_name="")

    assert adata.layers.keys() == adata_read.layers.keys()
    assert (adata.layers["L"] == adata_read.layers["L"]).all()
Example #14
def test_readwrite_loom(tmp_path):
    loom_path = Path(tmp_path / 'test.loom')
    adata = ad.AnnData(X=X, layers={'L': L.copy()})
    adata.write_loom(loom_path)
    adata_read = ad.read_loom(loom_path, X_name='')

    assert adata.layers.keys() == adata_read.layers.keys()
    assert (adata.layers['L'] == adata_read.layers['L']).all()
Example #15
def run_Seurat(datasets, task, task_adata, method_name, log_dir, args):
    method_key = '{}_aligned'.format(method_name)

    with tempfile.TemporaryDirectory() as tmp_dir:
        working_dir = Path(tmp_dir)
        print("saving data for Seurat")
        #task_adata.write('_tmp_adata_for_seurat.h5ad')
        if args.input_space == 'PCA':
            df = pd.DataFrame(task_adata.obsm['PCA'], index=task_adata.obs.index)
        else:
            df = task_adata.to_df()
        print(df.shape)
        #print(df.index)
        #print(df.columns)
        count_file = working_dir / '_tmp_counts.csv'
        df.T.to_csv(count_file)
        metadata_file = working_dir / '_tmp_meta.csv'
        task_adata.obs.to_csv(metadata_file)
        loom_result_file = working_dir / '_tmp_adata_for_seurat.loom'
        # Run seurat
        #cmd = "C:\\Users\\samir\\Anaconda3\\envs\\seuratV3\\Scripts\\Rscript.exe  seurat_align.R {}".format(task.batch_key)
        seurat_env_path = Path(args.seurat_env_path)
        if platform.system() == 'Windows':
            bin_path = seurat_env_path / 'Library' / 'mingw-w64' / 'bin'
            rscript_path = seurat_env_path / 'Scripts' / 'Rscript.exe'
            cmd = 'set PATH={};%PATH% && {} seurat_align.R {} {} {} {} {}'.format(bin_path, rscript_path, task.batch_key, args.seurat_dims, count_file, metadata_file, loom_result_file)
            cmd = cmd.split()
        else:
            bin_path = seurat_env_path / 'bin' 
            rscript_path = bin_path / 'Rscript'
            cmd = 'PATH="{}:$PATH" {} seurat_align.R {} {} {} {} {}'.format(bin_path, rscript_path, task.batch_key, args.seurat_dims, count_file, metadata_file, loom_result_file)
            #cmd = '{} seurat_align.R {} {} {} {} {}'.format(rscript_path, task.batch_key, args.seurat_dims, count_file, metadata_file, loom_result_file)

        #cmd = r"set PATH=C:\Users\samir\Anaconda3\envs\seuratV3\Library\mingw-w64\bin;%PATH% && C:\Users\samir\Anaconda3\envs\seuratV3\Scripts\Rscript.exe  seurat_align.R {}".format(task.batch_key)
        print('Running command: {}'.format(cmd))
        try:
            t0 = datetime.datetime.now()
            console_output = subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            t1 = datetime.datetime.now()
            time_str = pretty_tdelta(t1 - t0)
            print(f'took: {time_str}')
            with open(log_dir / 'fit_time.txt', 'w') as f:
                f.write(time_str + '\n')
            console_output = console_output.stdout.decode('UTF-8')
            print('Finished running')
            print(console_output)
            aligned_adata = anndata.read_loom(loom_result_file)
            print('done loading loom')
            print(aligned_adata.shape)
            #print(type(aligned_adata.X))
            # print(aligned_adata.obs.columns)
            # print(aligned_adata.obsm.keys())
            # print('todense...')
            task_adata.obsm[method_key] = aligned_adata.X.todense()
            # print(task_adata.obsm[method_key][:5, :])
        except subprocess.CalledProcessError as e:
            print("RUNNING SEURAT FAILED")
            print(e.stdout.decode('UTF-8'))
Example #16
def test_readwrite_loom():
    adata = ad.AnnData(X=X, layers={'L': L.copy()})
    adata.write_loom('test.loom')
    adata_read = ad.read_loom('test.loom', X_name='')

    assert adata.layers.keys() == adata_read.layers.keys()
    assert (adata.layers['L'] == adata_read.layers['L']).all()

    os.remove('test.loom')
Example #17
def test_readwrite_loom(typ, tmp_path):
    X = typ(X_list)
    adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata.write_loom(tmp_path / 'test.loom')
    adata = ad.read_loom(tmp_path / 'test.loom', sparse=typ is csr_matrix)
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
Example #18
def test_readwrite_loom():
    for i, typ in enumerate([np.array, csr_matrix]):
        X = typ(X_list)
        adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
        adata.write_loom('./test.loom')
        adata = ad.read_loom('./test.loom', sparse=(i == 1))
        if isinstance(X, np.ndarray):
            assert np.allclose(adata.X, X)
        else:
            # TODO: this should not be necessary
            assert np.allclose(adata.X.toarray(), X.toarray())
Example #19
def test_kazu_new():
    dir_path = '../data/MCF10A_exp1/'
    exp1_cbc_gbc_frame = pd.read_csv(os.path.join(dir_path,
                                                  'CBC_GBC_summary.txt'),
                                     delimiter='\t')
    exp2_cbc_gbc_frame = pd.read_csv(os.path.join(dir_path, 'CBC_GBC.txt'),
                                     delimiter='\t')
    kazu_onedrive_exp1_cbc2gbc_mapping = get_mapping(exp1_cbc_gbc_frame)
    kazu_onedrive_exp1_cbcs = list(kazu_onedrive_exp1_cbc2gbc_mapping.keys())
    kazu_onedrive_exp2_cbc2gbc_mapping = get_mapping(exp2_cbc_gbc_frame)
    kazu_onedrive_exp2_cbcs = list(kazu_onedrive_exp2_cbc2gbc_mapping.keys())

    ann_obj = anndata.read_loom(
        os.path.join(dir_path, 'possorted_genome_bam_RIG79.loom'))
    adata_cbc_codes = [x[x.find(':') + 1:] for x in list(ann_obj.obs_names)]
    # ann_obj = anndata.read_h5ad(os.path.join(dir_path, 'adata.h5ad'))
    # adata_cbc_codes = list(ann_obj.obs_names)

    kazu_onedrive_exp1_barcodes = pd.read_csv(os.path.join(
        dir_path, 'barcodes.tsv'),
                                              header=None,
                                              delimiter='\t')[0]
    kazu_onedrive_exp1_barcodes = [s[:-2] for s in kazu_onedrive_exp1_barcodes]
    whitelist_barcodes = pd.read_csv(os.path.join(dir_path,
                                                  '10xv2_whitelist.txt'),
                                     header=None,
                                     delimiter='\t')[0]
    print('sample outputs:')
    print(sorted(list(adata_cbc_codes))[:10])
    print(sorted(kazu_onedrive_exp1_cbcs)[:10])
    print(sorted(kazu_onedrive_exp1_barcodes)[:10])

    print('total obs (cbc) in ann data:', len(adata_cbc_codes))
    print('len of barcode onedrive file:', len(kazu_onedrive_exp1_barcodes))
    print('len of mapping onedrive file:', len(kazu_onedrive_exp1_cbcs))
    print('len of whitelist:', len(whitelist_barcodes))
    print('matched #cell barcodes between adata and kazu onedrive exp1:',
          len(set(adata_cbc_codes) & set(kazu_onedrive_exp1_cbcs)))
    print('matched #cell barcodes between adata and kazu onedrive exp2:',
          len(set(adata_cbc_codes) & set(kazu_onedrive_exp2_cbcs)))
    print('matched #cell barcodes between kazu onedrive barcodes and adata:',
          len(set(adata_cbc_codes) & set(kazu_onedrive_exp1_barcodes)))
    print(
        'matched #cell barcodes between kazu onedrive barcodes and kazu onedrive CBC_GBC mapping:',
        len(set(kazu_onedrive_exp1_cbcs) & set(kazu_onedrive_exp1_barcodes)))

    print('matched #cell barcodes between whitelist barcodes and annData:',
          len(set(whitelist_barcodes) & set(adata_cbc_codes)))
    print(
        'matched #cell barcodes between whitelist barcodes and kazu onedrive barcodes:',
        len(set(whitelist_barcodes) & set(kazu_onedrive_exp1_barcodes)))
    print(
        'matched #cell barcodes between whitelist barcodes and kazu onedrive CBC_GBC mapping:',
        len(set(kazu_onedrive_exp1_cbcs) & set(whitelist_barcodes)))
Example #20
    def read_adata(self, filesystem, path):
        path_lc = path.lower()
        path_lc = path_lc.rstrip("/")
        if path_lc.endswith(".loom"):
            adata = anndata.read_loom(filesystem.open(path))
        elif path_lc.endswith(".zarr"):
            adata = anndata.read_zarr(filesystem.get_mapper(path))
        elif path_lc.endswith(".tsv"):
            adata = read_star_fusion_file(filesystem.open(path))
        elif path_lc.endswith(".rds"):  # Seurat, convert to h5ad
            h5_file = path + ".h5ad"
            import os

            if (not os.path.exists(h5_file)
                    or abs(os.path.getmtime(h5_file) - os.path.getmtime(path))
                    > 0.00001):
                import shutil
                import subprocess

                import pkg_resources

                print("Converting Seurat object")
                if os.path.exists(h5_file):
                    os.remove(h5_file)
                subprocess.check_call([
                    "Rscript",
                    pkg_resources.resource_filename("cirrocumulus",
                                                    "seurat2h5ad.R"),
                    path,
                    h5_file,
                ])
                shutil.copystat(path, h5_file)
            adata = anndata.read_h5ad(h5_file,
                                      backed="r" if self.backed else None)
            if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
                print("Using adata.raw")
                adata = anndata.AnnData(X=adata.raw.X,
                                        var=adata.raw.var,
                                        obs=adata.obs,
                                        obsm=adata.obsm,
                                        uns=adata.uns)
        else:
            if self.backed:
                adata = anndata.read_h5ad(path, backed="r")
            else:
                adata = anndata.read_h5ad(filesystem.open(path))
        if "module" in adata.uns:
            adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
                X=adata.uns["module"]["X"], var=adata.uns["module"]["var"])

        return adata
Example #21
def test_readwrite_loom(typ, tmp_path):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_src.obsm['X_a'] = np.zeros((adata_src.n_obs, 2))
    adata_src.varm['X_b'] = np.zeros((adata_src.n_vars, 3))
    adata_src.write_loom(tmp_path / 'test.loom', write_obsm_varm=True)

    adata = ad.read_loom(tmp_path / 'test.loom', sparse=typ is csr_matrix)
    if isinstance(X, np.ndarray):
        assert np.allclose(adata.X, X)
    else:
        # TODO: this should not be necessary
        assert np.allclose(adata.X.toarray(), X.toarray())
    assert 'X_a' in adata.obsm_keys() and adata.obsm['X_a'].shape[1] == 2
    assert 'X_b' in adata.varm_keys() and adata.varm['X_b'].shape[1] == 3
Example #22
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        elif filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        elif filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        elif filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        elif filepath[-3:] == 'txt' or filepath[-3:] == 'tab' or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        elif filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        elif filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
        else:
            # an unknown extension would otherwise leave `adata` unbound
            raise IncorrectFileFormat(
                "File does not exist or file format is incorrect.")
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
Example #23
def fetch_anndata(path, from_gcs):
  """Reads the input data and turns it into an anndata.AnnData object."""
  _, ext = os.path.splitext(path)

  # AnnData is based of HDF5 and doesn't have GCS file handlers
  # so we have to locally copy the file before reading it.
  if from_gcs:
    with tempfile.NamedTemporaryFile(delete=False) as tmp_file:
      tmp_path = tmp_file.name
    tf.io.gfile.copy(path, tmp_path, overwrite=True)
    path = tmp_path

  if ext == '.h5ad':
    adata = anndata.read_h5ad(path)
  elif ext == '.loom':
    adata = anndata.read_loom(path)
  else:
    raise app.UsageError('Only supports loom and h5ad files.')

  return adata
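fetch_anndata copies gs:// objects to a local temporary file first because HDF5 has no GCS file handler. A usage sketch with hypothetical paths; the GCS variant needs TensorFlow's tf.io.gfile available:

# Remote file: copied down via tf.io.gfile, then parsed locally.
adata = fetch_anndata("gs://my-bucket/data/pbmc.h5ad", from_gcs=True)

# Local file: read in place, no copy.
adata = fetch_anndata("data/pbmc.loom", from_gcs=False)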
Example #24
def get_data(dataset):
    """Download scycle example dataset

    Parameters
    ----------
    dataset: str
        dataset is a string with the name of the dataset to be downloaded.
        Must be one of: 'CHLA9' or 'sc200_CCLE'
    """
    #-- Get cache location
    cache_dir = os.path.dirname(os.path.realpath(__file__))

    #-- Check if cached, download otherwise
    #------ CHLA9
    if dataset == 'CHLA9':
        fname = cache_dir + '/chla9.h5ad'
        if 'chla9.h5ad' not in os.listdir(cache_dir):
            print('-- Downloading CHLA9 data from Xfer...')
            _download_scdata(url_chla9, fname)
            print('-- Download concluded.')
    elif dataset == 'sc200_CCLE':
        fname = cache_dir + '/sc200_ccle.h5ad'
        if 'sc200_ccle.h5ad' not in os.listdir(cache_dir):
            print('-- Downloading sc200_ccle data from Xfer...')
            _download_scdata(url_sc200, fname)
            print('-- Download concluded.')
    else:
        print("Dataset not in list of supported datasets. Must be one of:" +
              ', '.join(datasets))
        return None

    #-- Load from cache
    print('-- Loading data from cache...')
    if len(re.findall('loom$', fname)) > 0:
        scdata = anndata.read_loom(fname)
        scdata.var_names_make_unique()
        print('Done.')
    elif len(re.findall('h5ad$', fname)) > 0:
        scdata = anndata.read_h5ad(fname)

    return scdata
Example #25
def read_adata(path, backed=False, spatial_directory=None, use_raw=False):
    import anndata
    adata = anndata.read_loom(path) if path.lower().endswith(
        '.loom') else anndata.read(path, backed=backed)
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[
            0]:
        logger.info('Using adata.raw')
        adata = anndata.AnnData(X=adata.raw.X,
                                var=adata.raw.var,
                                obs=adata.obs,
                                obsm=adata.obsm,
                                uns=adata.uns)

    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            print('No spatial data found in {}'.format(spatial_directory))
    if not backed:
        if scipy.sparse.issparse(adata.X) and scipy.sparse.isspmatrix_csr(
                adata.X):
            adata.X = adata.X.tocsc()

    def fix_column_names(df):
        rename = {}
        for c in df.columns:
            if c.find(' ') != -1:
                rename[c] = c.replace(' ', '_')
        return df.rename(rename, axis=1) if len(rename) > 0 else df

    adata.obs = fix_column_names(adata.obs)
    adata.var = fix_column_names(adata.var)
    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(
                adata.obs[field]):
            logger.info('Converting {} to categorical'.format(field))
            adata.obs[field] = adata.obs[field].astype('category')
    for key in adata.obsm:
        if key.find(' ') != -1:
            new_key = key.replace(' ', '_')
            adata.obsm[new_key] = adata.obsm[key]
            del adata.obsm[key]
    return adata
Example #26
def _read(filename,
          backed=False,
          sheet=None,
          ext=None,
          delimiter=None,
          first_column_names=None,
          backup_url=None,
          cache=False,
          suppress_cache_warning=False,
          **kwargs):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n' +
                         avail_exts)
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present: logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (
        settings.cachedir +
        filename.lstrip('./').replace('/', '-').replace('.' + ext, '.h5ad'))
    if filename_cache.endswith('.gz'): filename_cache = filename_cache[:-3]
    if filename_cache.endswith('.bz2'): filename_cache = filename_cache[:-4]
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint(
                'This might be very slow. Consider passing `cache=True`, '
                'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext in {'mtx', 'mtx.gz'}:
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg(
                    '... assuming \'.data\' means tab or white-space '
                    'separated text file',
                    v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        elif ext == 'loom':
            adata = read_loom(filename=filename, **kwargs)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speedup reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
Example #27
import sys
import numpy as np
import pandas as pd
import anndata
import loompy
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, '/home/fabio/university/postdoc/northstar/build/lib')
import northstar

if __name__ == '__main__':

    print('Load GBM data')
    fdn_gbm = '../data/GBM_data_and_metadata/'
    fn_loom_gbm = fdn_gbm + 'GBM_data.loom'
    adata_gbm = anndata.read_loom(fn_loom_gbm, sparse=False)
    adata_gbm.var_names = adata_gbm.var['GeneName']
    adata_gbm.X = 1e6 * (adata_gbm.X.T / adata_gbm.X.sum(axis=1)).T
    adata_gbm.obs['CellType'] = adata_gbm.obs['Cell_type']

    print('Load Darmanis atlas')
    af = northstar.AtlasFetcher()
    adata_dmnf = af.fetch_atlas('Darmanis_2015_nofetal', kind='subsample')

    print('Load Velmeshev autism atlas landmarks')
    fdn = '../data/Autism/'
    fn_loom = fdn + 'subsample.loom'
    adata = anndata.read_loom(fn_loom, sparse=False)
    adata.X = adata.X * 100
    adata.var_names = adata.var['GeneName']
    adata.obs['CellType'] = adata.obs['cluster']
Example #28
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file', )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speedup reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
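This looks like scanpy's internal _read; if so, it is normally reached through the public sc.read, which forwards extra keyword arguments (such as read_loom's) via **kwargs. A usage sketch with a hypothetical file:

import scanpy as sc

# First call parses the loom file; cache=True writes an .h5ad cache so the
# next call takes the fast read_h5ad(path_cache) branch above.
adata = sc.read("data/sample.loom", cache=True)

# Keyword arguments not consumed by _read are passed through to read_loom.
adata = sc.read("data/sample.loom", sparse=True, cleanup=True)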
Example #29
import sys
import numpy as np
import pandas as pd
import anndata
import loompy
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, '/home/fabio/university/postdoc/northstar/build/lib')
import northstar

if __name__ == '__main__':

    print('Load autism data (subsampled)')
    fdn = '../data/Autism/'
    fn_loom = fdn + 'subsample_control.loom'
    adata = anndata.read_loom(fn_loom, sparse=False)
    adata.X = adata.X * 100
    adata.var_names = adata.var['GeneName']
    adata.obs['CellType'] = adata.obs['cluster']
    adata.obs['CellType'].replace(
        {
            'L2/3': 'Neuron',
            'L5/6': 'Neuron',
            'L4': 'Neuron',
            'L5/6-CC': 'Neuron',
            'IN-VIP': 'Neuron',
            'IN-PV': 'Neuron',
            'IN-SV2C': 'Neuron',
            'IN-SST': 'Neuron',
            'Neu-NRGN-I': 'Neuron',
            'Neu-NRGN-II': 'Neuron',
Example #30
def read_loom_to_anndata(ds_file: Path):
    """Reads a dataset in the loom format into the AnnData format."""

    adata = anndata.read_loom(ds_file)
    return adata
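A quick round-trip check of the helper above; a sketch assuming loompy is installed and the working directory is writable:

import numpy as np
from pathlib import Path
import anndata

# Write a tiny AnnData to loom, then read it back through the helper.
src = anndata.AnnData(X=np.arange(12, dtype=np.float32).reshape(3, 4))
src.write_loom("tiny.loom")

adata = read_loom_to_anndata(Path("tiny.loom"))
assert adata.shape == (3, 4)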