Example #1
    def read_adata(self, path):
        path_lc = path.lower()
        if path_lc.endswith('.loom'):
            return anndata.read_loom(path)
        elif path_lc.endswith('.zarr'):
            return anndata.read_zarr(path)
        elif path_lc.endswith('.tsv'):
            return read_star_fusion_file(path)
        elif path_lc.endswith('.rds'):  # Seurat object, convert to h5ad
            h5_file = path + '.h5ad'
            import os
            # Reconvert when the cached .h5ad is missing or its timestamp no
            # longer matches the source .rds (copystat syncs them below).
            if not os.path.exists(h5_file) or abs(os.path.getmtime(h5_file) - os.path.getmtime(path)) > 0.00001:
                import subprocess
                import pkg_resources
                import shutil
                print('Converting Seurat object')
                if os.path.exists(h5_file):
                    os.remove(h5_file)
                subprocess.check_call(
                    ['Rscript', pkg_resources.resource_filename("cirrocumulus", 'seurat2h5ad.R'), path, h5_file])
                shutil.copystat(path, h5_file)
            adata = anndata.read(h5_file, backed=self.backed)
            if adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
                print('Using adata.raw')
                adata = anndata.AnnData(X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns)
            return adata
        return anndata.read(path, backed=self.backed)
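The `.rds` branch keys cache freshness off file timestamps. A standalone sketch of the same check, pure stdlib (the 1e-5 tolerance mirrors the slack used above):

import os

def h5ad_cache_is_stale(src_path, cache_path, tol=1e-5):
    # Stale when the converted file is missing, or when its mtime no longer
    # matches the source (copystat keeps the two in sync after conversion).
    if not os.path.exists(cache_path):
        return True
    return abs(os.path.getmtime(cache_path) - os.path.getmtime(src_path)) > tol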
Example #2
def main():
    options = get_options()

    barcodes = {
        'CGTACTAG': 'tn5',
        'TCCTGAGC': 'tn5',
        'TCATGAGC': 'tn5',
        'CCTGAGAT': 'tn5',
        'TAAGGCGA': 'tnH',
        'GCTACGCT': 'tnH',
        'AGGCTCCG': 'tnH',
        'CTGCGCAT': 'tnH'
    }

    if options.barcodes:
        barcodes = {}
        with open(options.barcodes) as fh:
            for line in fh:
                t = line.split()
                barcodes[t[0]] = t[1]

    # Build one AnnData per layer label (e.g. 'tn5'/'tnH'), summing counts
    # across every input file whose name contains a barcode for that label.
    ad_l = dict.fromkeys(barcodes.values())
    for layer in ad_l:
        layer_bcs = [bc for bc in barcodes if barcodes[bc] == layer]
        layer_files = [f for bc in layer_bcs for f in options.input_files if bc in f]
        ad_tmp = ad.read(layer_files[0])
        for f in layer_files[1:]:
            _X = ad.read(f)
            _X = _X[ad_tmp.obs_names]  # align cell order before the positional sum
            ad_tmp.X = ad_tmp.X + _X.X
        ad_l[layer] = ad_tmp.copy()
    adata = ad_l[options.Xdata].copy()
    for layer in ad_l:
        adata.layers[layer] = ad_l[layer].X

    adata.write(f'{options.sample_name}.h5ad')
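The `_X = _X[ad_tmp.obs_names]` line is load-bearing: `+` on `.X` is positional, so cells must first be reordered to a shared `obs_names` order. A minimal sketch with toy data (same var order assumed):

import anndata as ad
import numpy as np
import pandas as pd

a = ad.AnnData(np.eye(3), obs=pd.DataFrame(index=['c1', 'c2', 'c3']))
b = ad.AnnData(np.eye(3), obs=pd.DataFrame(index=['c3', 'c1', 'c2']))

b_aligned = b[a.obs_names]   # reorder b's cells to a's order
summed = a.X + b_aligned.X   # the positional sum is now safe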
Example #3
def _load_saved_gimvi_files(
    dir_path: str,
    load_seq_adata: bool,
    load_spatial_adata: bool,
    prefix: Optional[str] = None,
    map_location: Optional[Literal["cpu", "cuda"]] = None,
) -> Tuple[dict, np.ndarray, np.ndarray, dict, AnnData, AnnData]:
    file_name_prefix = prefix or ""
    seq_data_path = os.path.join(dir_path, f"{file_name_prefix}adata_seq.h5ad")
    spatial_data_path = os.path.join(dir_path,
                                     f"{file_name_prefix}adata_spatial.h5ad")

    adata_seq, adata_spatial = None, None
    if load_seq_adata and os.path.exists(seq_data_path):
        adata_seq = read(seq_data_path)
    elif load_seq_adata and not os.path.exists(seq_data_path):
        raise ValueError(
            "Save path contains no saved anndata and no adata was passed.")
    if load_spatial_adata and os.path.exists(spatial_data_path):
        adata_spatial = read(spatial_data_path)
    elif load_spatial_adata and not os.path.exists(spatial_data_path):
        raise ValueError(
            "Save path contains no saved anndata and no adata was passed.")

    use_legacy = _should_use_legacy_saved_gimvi_files(dir_path,
                                                      file_name_prefix)

    # TODO(jhong): Remove once legacy load is deprecated.
    if use_legacy:
        (
            model_state_dict,
            seq_var_names,
            spatial_var_names,
            attr_dict,
        ) = _load_legacy_saved_gimvi_files(dir_path, file_name_prefix,
                                           map_location)
    else:
        model_path = os.path.join(dir_path, f"{file_name_prefix}model.pt")

        model = torch.load(model_path, map_location=map_location)
        model_state_dict = model["model_state_dict"]
        seq_var_names = model["seq_var_names"]
        spatial_var_names = model["spatial_var_names"]
        attr_dict = model["attr_dict"]

    return (
        attr_dict,
        seq_var_names,
        spatial_var_names,
        model_state_dict,
        adata_seq,
        adata_spatial,
    )
Example #4
def test_read_write_X(tmp_path, mtx_format, backed_mode, force_dense):
    base_pth = Path(tmp_path)
    orig_pth = base_pth / "orig.h5ad"
    backed_pth = base_pth / "backed.h5ad"

    orig = ad.AnnData(mtx_format(asarray(sparse.random(10, 10, format="csr"))))
    orig.write(orig_pth)

    backed = ad.read(orig_pth, backed=backed_mode)
    backed.write(backed_pth, as_dense=["X"])
    backed.file.close()

    from_backed = ad.read(backed_pth)
    assert np.all(asarray(orig.X) == asarray(from_backed.X))
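The same round trip outside the test harness; a minimal sketch. `as_dense=["X"]` materializes a sparse matrix as a dense HDF5 dataset, and the backed file handle must be closed explicitly:

import anndata as ad
import numpy as np
from scipy import sparse

ad.AnnData(sparse.random(10, 10, format='csr')).write('orig.h5ad')

backed = ad.read_h5ad('orig.h5ad', backed='r')  # X stays on disk
backed.write('dense.h5ad', as_dense=['X'])      # write X as a dense dataset
backed.file.close()                             # release the HDF5 handle

orig = ad.read_h5ad('orig.h5ad')
roundtrip = ad.read_h5ad('dense.h5ad')
assert np.allclose(orig.X.toarray(), np.asarray(roundtrip.X))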
Example #5
def test_readwrite_maintain_X_dtype(typ, backing_h5ad):
    X = typ(X_list)
    adata_src = ad.AnnData(X, dtype="int8")
    adata_src.write(backing_h5ad)

    adata = ad.read(backing_h5ad)
    assert adata.X.dtype == adata_src.X.dtype
Example #6
def test_readwrite_h5ad_one_dimension(typ, backing_h5ad):
    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    adata_one = adata_src[:, 0].copy()
    adata_one.write(backing_h5ad)
    adata = ad.read(backing_h5ad)
    assert adata.shape == (3, 1)
Example #7
def app_conf(request, tmpdir_factory):
    dataset_path = "test-data/pbmc3k_no_raw.h5ad"

    if not request.param:
        app = create_app()
        configure_app(app, [dataset_path], None, None)
        dataset_id = dataset_path
        os.environ[CIRRO_TEST] = "false"
    else:
        os.environ[CIRRO_TEST] = "true"
        os.environ[CIRRO_DB_URI] = "mongodb://localhost:27018/cirrocumulus-test"
        app = cached_app()
    with app.test_client() as client:
        if request.param:
            # insert dataset
            output_dir = str(tmpdir_factory.mktemp("data").join("test.zarr"))
            PrepareData(
                datasets=[anndata.read(dataset_path)],
                output=output_dir,
                output_format="zarr",
                no_auto_groups=True,
            ).execute()
            r = client.post("/api/dataset", data=dict(url=output_dir, name="test"))
            dataset_id = r.json["id"]
        yield client, dataset_id
Example #8
def test_readwrite_sparse_as_dense(backing_h5ad):
    adata_src = ad.AnnData(X_sp)
    adata_src.write(backing_h5ad, force_dense=True)

    adata = ad.read(backing_h5ad, chunk_size=2)
    assert issparse(adata.X)
    assert np.allclose(X_sp.toarray(), adata.X.toarray())
Example #9
def preprocess(anndatafile):
    ann = anndata.read(anndatafile)
    counts = ann.X
    genes = ann.var.index.astype("str")
    cells = ann.obs["unique_cell_id"].values.astype("str")

    important_genes = rnaseqTools.geneSelection(
        counts,
        n=1000,
        decay=1.5,
        genes=genes,
        plot=False,
    )

    # normalize by library size to the median depth, log-transform,
    # then project onto the top 50 principal components via SVD
    librarySizes = np.sum(counts, axis=1)
    median = np.median(np.asarray(librarySizes).squeeze())
    X = np.log2(counts[:, important_genes] / librarySizes * median + 1)
    X = np.array(X)
    X = X - X.mean(axis=0)
    U, s, V = np.linalg.svd(X, full_matrices=False)
    U[:, np.sum(V, axis=1) < 0] *= -1  # fix component signs for reproducibility
    X = np.dot(U, np.diag(s))
    X = X[:, np.argsort(s)[::-1]][:, :50]

    # map the group assignments to a color
    stage = ann.obs["TimeID"].map(lambda x: lbl_map[x]).values.astype("str")
    alt_colors = ann.obs["TissueName"].map(
        lambda x: tissue_map[x]).values.astype("str")
    return X, stage, alt_colors
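The centering-plus-SVD block above is ordinary PCA up to per-component sign flips; a small self-contained check against scikit-learn:

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 80))
X = X - X.mean(axis=0)

U, s, Vt = np.linalg.svd(X, full_matrices=False)
scores_svd = (U * s)[:, :50]                     # U @ diag(s), top 50 PCs
scores_pca = PCA(n_components=50, svd_solver='full').fit_transform(X)

# identical except for arbitrary per-component signs
assert np.allclose(np.abs(scores_svd), np.abs(scores_pca))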
Example #10
def _load_saved_files(
    dir_path: str,
    load_adata: bool,
    prefix: Optional[str] = None,
    map_location: Optional[Literal["cpu", "cuda"]] = None,
) -> Tuple[dict, np.ndarray, dict, AnnData]:
    """Helper to load saved files."""
    file_name_prefix = prefix or ""
    adata_path = os.path.join(dir_path, f"{file_name_prefix}adata.h5ad")

    if os.path.exists(adata_path) and load_adata:
        adata = read(adata_path)
    elif not os.path.exists(adata_path) and load_adata:
        raise ValueError("Save path contains no saved anndata and no adata was passed.")
    else:
        adata = None

    use_legacy = _should_use_legacy_saved_files(dir_path, file_name_prefix)

    # TODO(jhong): Remove once legacy load is deprecated.
    if use_legacy:
        model_state_dict, var_names, attr_dict = _load_legacy_saved_files(
            dir_path, file_name_prefix, map_location
        )
    else:
        model_path = os.path.join(dir_path, f"{file_name_prefix}model.pt")

        model = torch.load(model_path, map_location=map_location)
        model_state_dict = model["model_state_dict"]
        var_names = model["var_names"]
        attr_dict = model["attr_dict"]

    return attr_dict, var_names, model_state_dict, adata
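For orientation, a hypothetical save-side counterpart matching the non-legacy layout this loader expects. The function name, arguments, and `attr_dict` contents are assumptions for illustration, not the library's actual saver:

import os
import torch

def _save_files(module, adata, attr_dict, dir_path, prefix=''):
    os.makedirs(dir_path, exist_ok=True)
    if adata is not None:
        adata.write(os.path.join(dir_path, f'{prefix}adata.h5ad'))
    # bundle everything the loader unpacks into a single model.pt
    torch.save(
        {
            'model_state_dict': module.state_dict(),  # torch.nn.Module weights
            'var_names': adata.var_names.to_numpy() if adata is not None else None,
            'attr_dict': attr_dict,                   # model attributes to restore
        },
        os.path.join(dir_path, f'{prefix}model.pt'),
    )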
Example #11
def _load_purified_pbmc_dataset(
    save_path: str = "data/",
    subset_datasets: Optional[List[str]] = None,
) -> anndata.AnnData:
    url = "https://github.com/YosefLab/scVI-data/raw/master/PurifiedPBMCDataset.h5ad"
    save_fn = "PurifiedPBMCDataset.h5ad"
    _download(url, save_path, save_fn)
    path_to_file = os.path.join(save_path, save_fn)
    adata = anndata.read(path_to_file)

    dataset_names = [
        "cd4_t_helper",
        "regulatory_t",
        "naive_t",
        "memory_t",
        "cytotoxic_t",
        "naive_cytotoxic",
        "b_cells",
        "cd4_t_helper",
        "cd34",
        "cd56_nk",
        "cd14_monocytes",
    ]
    if subset_datasets is not None:
        row_indices = []
        for dataset in subset_datasets:
            assert dataset in dataset_names
            idx = np.where(adata.obs["cell_types"] == dataset)[0]
            row_indices.append(idx)
        row_indices = np.concatenate(row_indices)
        adata = adata[row_indices].copy()

    return adata
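A hypothetical call, subsetting to two of the purified populations after the one-time download:

adata = _load_purified_pbmc_dataset(
    save_path='data/',
    subset_datasets=['b_cells', 'cd14_monocytes'],
)
print(adata.obs['cell_types'].value_counts())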
Example #12
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith('.loom'):
        adata = anndata.read_loom(path)
    elif path.lower().endswith('.zarr'):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if 'module' in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns['module']['X'], var=adata.uns['module']['var'])
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
        logger.info('Using adata.raw')
        adata = anndata.AnnData(X=adata.raw.X,
                                var=adata.raw.var,
                                obs=adata.obs,
                                obsm=adata.obsm,
                                uns=adata.uns)

    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info('No spatial data found in {}'.format(spatial_directory))

    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
            logger.info('Converting {} to categorical'.format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype('category')
    return adata
Example #13
    def __init__(self):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        macos = anndata.read(
            os.path.join(current_dir, "macosko_dropseq_control.h5ad"))

        super(MacosDataset, self).__init__(macos,
                                           select_genes_keywords=["ercc"])
Example #14
    def __init__(self):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        klein = anndata.read(
            os.path.join(current_dir, "klein_indrops_control_GSM1599501.h5ad"))

        super(KleinDataset, self).__init__(klein,
                                           select_genes_keywords=["ercc"])
Example #15
    def __init__(self, n_genes=100):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        svens = anndata.read(
            os.path.join(current_dir, "svensson_chromium_control.h5ad"))

        sven2 = svens[svens.obs.query('sample == "20312"').index]
        super(Sven2DatasetRNA, self).__init__(sven2, n_genes=n_genes)
Example #16
    def __init__(self):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        zheng = anndata.read(
            os.path.join(current_dir, "zheng_gemcode_control.h5ad"))

        super(ZhengDataset, self).__init__(zheng,
                                           select_genes_keywords=["ercc"])
Example #17
    def __init__(self, data, select_genes_keywords=()):  # tuple: avoid a mutable default

        super().__init__()

        if isinstance(data, str):
            anndataset = anndata.read(data)
        else:
            anndataset = data

        idx_and_gene_names = list(enumerate(anndataset.var.index))
        for keyword in select_genes_keywords:
            idx_and_gene_names = [(idx, gene_name)
                                  for idx, gene_name in idx_and_gene_names
                                  if keyword.lower() in gene_name.lower()]

        gene_indices = np.array([idx for idx, _ in idx_and_gene_names])
        gene_names = np.array(
            [gene_name for _, gene_name in idx_and_gene_names])

        expression_mat = np.array(anndataset.X[:, gene_indices].todense())

        select_cells = expression_mat.sum(axis=1) > 0
        expression_mat = expression_mat[select_cells, :]

        select_genes = (expression_mat > 0).mean(axis=0) > 0.21
        gene_names = gene_names[select_genes]
        expression_mat = expression_mat[:, select_genes]

        print("Final dataset shape :", expression_mat.shape)

        self.populate_from_data(X=expression_mat, gene_names=gene_names)
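The keyword selection above reduces to a substring match on lower-cased gene names (e.g. keeping ERCC spike-ins); a tiny standalone illustration:

import numpy as np

gene_names = np.array(['ERCC-00002', 'Actb', 'ercc-00044', 'Gapdh'])
keep = np.array(['ercc' in name.lower() for name in gene_names])
print(gene_names[keep])   # ['ERCC-00002' 'ercc-00044']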
Example #18
def make_partial_results_filenames(wildcards):
    stacked_h5ad = f'stacked_h5ads/{wildcards.dataset_project_id}-{wildcards.dataset_sample_id}-stacked.h5ad'
    adata = anndata.read(stacked_h5ad)
    total_cells = adata.n_obs
    n_retained_cells = int(0.85 * total_cells)
    print('Total cells:', total_cells)
    print('Retained cells:', n_retained_cells)

    # grow the number of sampled cells geometrically (factor sqrt(2)),
    # starting from 500, while staying below the retained-cell cap
    cells_sizes = []
    sampling_size = 500
    while sampling_size < n_retained_cells:
        cells_sizes.append(sampling_size)
        sampling_size = int(sampling_size * np.sqrt(2))

    ss_depths = list(adata.layers.keys())
    print('Cell sampling sizes:', cells_sizes)
    print('Depth layers:', ss_depths)

    return expand(
        'scvi_output/partial_csvs/{{dataset_project_id}}-{{dataset_sample_id}}-c{ss_cells}-d{ss_depth}-SUCCESS.csv',
        ss_cells=cells_sizes,
        ss_depth=ss_depths,
    )
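The geometric ladder of sampling sizes, factored into a reusable helper (same arithmetic as the loop above):

import numpy as np

def sampling_ladder(start, limit, factor=np.sqrt(2)):
    """Sizes start, int(start*factor), ... strictly below limit."""
    sizes = []
    size = start
    while size < limit:
        sizes.append(size)
        size = int(size * factor)
    return sizes

print(sampling_ladder(500, int(0.85 * 10000)))
# [500, 707, 999, 1412, 1996, 2822, 3990, 5642, 7978]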
Example #19
def test_raw(backing_h5ad):
    X = np.array(X_list)
    adata = ad.AnnData(X,
                       obs=obs_dict,
                       var=var_dict,
                       uns=uns_dict,
                       dtype='int32')

    # init raw
    adata.raw = adata

    assert adata.raw[:, 0].X.tolist() == [1, 4, 7]

    adata = adata[:, [0, 1]]

    assert adata.var_names.tolist() == ['var1', 'var2']
    assert adata.raw.var_names.tolist() == ['var1', 'var2', 'var3']

    # read write
    adata.write(backing_h5ad)
    adata = ad.read(backing_h5ad)

    assert adata.raw[:, 0].X.tolist() == [1, 4, 7]
    assert adata.raw.var_names.tolist() == ['var1', 'var2', 'var3']
    assert adata.var_names.tolist() == ['var1', 'var2']
Example #20
def test_raw(backing_h5ad):
    X = np.array(X_list)
    adata = ad.AnnData(X,
                       obs=obs_dict,
                       var=var_dict,
                       uns=uns_dict,
                       dtype='int32')

    # init raw
    adata.raw = adata

    assert adata.raw[:, 0].X.tolist() == [[1], [4], [7]]

    adata = adata[:, [0, 1]]

    assert adata.var_names.tolist() == ['var1', 'var2']
    assert adata.raw.var_names.tolist() == ['var1', 'var2', 'var3']

    # read write
    with pytest.warns(ImplicitModificationWarning,
                      match="Initializing view as actual"):
        # TODO: don’t modify adata just to write it
        adata.write(backing_h5ad)
    adata = ad.read(backing_h5ad)

    assert adata.raw[:, 0].X.tolist() == [[1], [4], [7]]
    assert adata.raw.var_names.tolist() == ['var1', 'var2', 'var3']
    assert adata.var_names.tolist() == ['var1', 'var2']
Example #21
def test_readwrite_h5ad(typ, dataset_kwargs, backing_h5ad):
    tmpdir = tempfile.TemporaryDirectory()
    tmpdirpth = Path(tmpdir.name)
    mid_pth = tmpdirpth / "mid.h5ad"

    X = typ(X_list)
    adata_src = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict)
    assert not is_categorical_dtype(adata_src.obs["oanno1"])
    adata_src.raw = adata_src
    adata_src.write(backing_h5ad, **dataset_kwargs)

    adata_mid = ad.read(backing_h5ad)
    adata_mid.write(mid_pth, **dataset_kwargs)

    adata = ad.read_h5ad(mid_pth)
    assert is_categorical_dtype(adata.obs["oanno1"])
    assert not is_categorical_dtype(adata.obs["oanno2"])
    assert adata.obs.index.tolist() == ["name1", "name2", "name3"]
    assert adata.obs["oanno1"].cat.categories.tolist() == ["cat1", "cat2"]
    assert is_categorical_dtype(adata.raw.var["vanno2"])
    assert np.all(adata.obs == adata_src.obs)
    assert np.all(adata.var == adata_src.var)
    assert np.all(adata.var.index == adata_src.var.index)
    assert adata.var.index.dtype == adata_src.var.index.dtype
    assert type(adata.raw.X) is type(adata_src.raw.X)
    assert type(adata.raw.varm) is type(adata_src.raw.varm)
    assert np.allclose(asarray(adata.raw.X), asarray(adata_src.raw.X))
    assert np.all(adata.raw.var == adata_src.raw.var)
    assert isinstance(adata.uns["uns4"]["a"], (int, np.integer))
    assert isinstance(adata_src.uns["uns4"]["a"], (int, np.integer))
    assert type(adata.uns["uns4"]["c"]) is type(adata_src.uns["uns4"]["c"])
    assert_equal(adata, adata_src)
Example #22
def _load_saved_files(
    dir_path: str,
    load_adata: bool,
    map_location: Optional[Literal["cpu", "cuda"]] = None,
):
    """Helper to load saved files."""
    setup_dict_path = os.path.join(dir_path, "attr.pkl")
    adata_path = os.path.join(dir_path, "adata.h5ad")
    varnames_path = os.path.join(dir_path, "var_names.csv")
    model_path = os.path.join(dir_path, "model_params.pt")

    if os.path.exists(adata_path) and load_adata:
        adata = read(adata_path)
    elif not os.path.exists(adata_path) and load_adata:
        raise ValueError(
            "Save path contains no saved anndata and no adata was passed.")
    else:
        adata = None

    var_names = np.genfromtxt(varnames_path, delimiter=",", dtype=str)

    with open(setup_dict_path, "rb") as handle:
        attr_dict = pickle.load(handle)

    scvi_setup_dict = attr_dict.pop("scvi_setup_dict_")

    model_state_dict = torch.load(model_path, map_location=map_location)

    return scvi_setup_dict, attr_dict, var_names, model_state_dict, adata
Example #23
def read_adata(path, spatial_directory=None, use_raw=False):
    if path.lower().endswith(".loom"):
        adata = anndata.read_loom(path)
    elif path.lower().endswith(".zarr"):
        adata = anndata.read_zarr(path)
    else:
        adata = anndata.read(path)
    if "module" in adata.uns:
        adata.uns[ADATA_MODULE_UNS_KEY] = anndata.AnnData(
            X=adata.uns["module"]["X"], var=adata.uns["module"]["var"]
        )
    if use_raw and adata.raw is not None and adata.shape[0] == adata.raw.shape[0]:
        logger.info("Using adata.raw")
        adata = anndata.AnnData(
            X=adata.raw.X, var=adata.raw.var, obs=adata.obs, obsm=adata.obsm, uns=adata.uns
        )

    if spatial_directory is not None:
        if not add_spatial(adata, spatial_directory):
            logger.info("No spatial data found in {}".format(spatial_directory))

    for field in categorical_fields_convert:
        if field in adata.obs and not pd.api.types.is_categorical_dtype(adata.obs[field]):
            logger.info("Converting {} to categorical".format(field))
            adata.obs[field] = adata.obs[field].astype(str).astype("category")
    return adata
Example #24
File: io.py Project: michalk8/wot
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None, **keywords):
    """
    Read h5ad, loom, mtx, 10X h5, and csv formatted files

    Parameters
    ----------
    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame
    var: {str, pd.DataFrame}
        Path to var data file or a data frame
    obs_filter: {str, list}
        File with one id per line, name of a boolean field in obs, or a list of ids
    var_filter: {str, list}
        File with one id per line, name of a boolean field in var, or a list of ids
    Returns
    -------
    Annotated data matrix.
    """

    _, ext = os.path.splitext(str(path).lower())
    if ext == '.txt':
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        adata = anndata.AnnData(X=df.values, obs=pd.DataFrame(index=df.index), var=pd.DataFrame(index=df.columns))
    elif ext == '.h5ad':
        adata = anndata.read(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    elif ext == '.mtx':
        adata = anndata.read_mtx(path)
    elif ext == '.zarr':
        adata = anndata.read_zarr(path)
    else:
        raise ValueError('Unknown file format: {}'.format(ext))

    def get_df(meta):
        if not isinstance(meta, pd.DataFrame):
            tmp_path = None
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            meta = pd.read_csv(meta, sep=None, index_col='id', engine='python')
            if tmp_path is not None:
                os.remove(tmp_path)
        return meta

    if obs is not None:
        if not isinstance(obs, list) and not isinstance(obs, tuple):
            obs = [obs]
        for item in obs:
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        if not isinstance(var, list) and not isinstance(var, tuple):
            var = [var]
        for item in var:
            adata.var = adata.var.join(get_df(item))

    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
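A hypothetical invocation (file names are placeholders): read an .h5ad, join per-cell metadata from a CSV whose 'id' column matches obs_names, then keep only cells flagged by a boolean column:

adata = read_dataset(
    'expression.h5ad',
    obs='days.csv',           # joined onto adata.obs by its 'id' column
    obs_filter='passed_qc',   # boolean obs column, e.g. provided by days.csv
)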
Example #25
    def __init__(self):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        svens = anndata.read(
            os.path.join(current_dir, "svensson_chromium_control.h5ad"))

        sven2 = svens[svens.obs.query('sample == "20312"').index]
        super(Sven2Dataset, self).__init__(sven2,
                                           select_genes_keywords=["ercc"])
Example #26
    def __init__(self, n_rna=100, threshold=0.01):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        klein = anndata.read(
            os.path.join(current_dir, 'klein_indrops_control_GSM1599501.h5ad'))

        super(KleinNegativeControlDataset, self).__init__(klein,
                                                          n_rna=n_rna,
                                                          threshold=threshold)
Example #27
    def __init__(self, n_rna=100, threshold=0.01):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        svens = anndata.read(
            os.path.join(current_dir, 'svensson_chromium_control.h5ad'))

        sven2 = svens[svens.obs.query('sample == "20312"').index]
        super(Svensson2NegativeControlDataset,
              self).__init__(sven2, n_rna=n_rna, threshold=threshold)
Example #28
    def __init__(self, n_rna=100, threshold=0.01):
        current_dir = os.path.dirname(os.path.realpath(__file__))
        zheng = anndata.read(
            os.path.join(current_dir, 'zheng_gemcode_control.h5ad'))

        super(ZhengNegativeControlDataset, self).__init__(zheng,
                                                          n_rna=n_rna,
                                                          threshold=threshold)
Example #29
def test_raw_rw(adata_raw, backing_h5ad):
    adata_raw.write(backing_h5ad)
    adata_read = ad.read(backing_h5ad)

    assert_equal(adata_read, adata_raw, exact=True)

    assert adata_raw.var_names.tolist() == ["var1", "var2"]
    assert adata_raw.raw.var_names.tolist() == ["var1", "var2", "var3"]
    assert adata_raw.raw[:, 0].X.tolist() == [[1], [4], [7]]
Example #30
def main():
    usage = 'hashsolo'
    parser = ArgumentParser(usage,
                            formatter_class=ArgumentDefaultsHelpFormatter)

    parser.add_argument(dest='data_file',
                        help='h5ad file containing cell hashing counts')
    parser.add_argument('-j',
                        dest='model_json_file',
                        default=None,
                        help='json file to pass optional arguments')
    parser.add_argument('-o',
                        dest='out_dir',
                        default='hashsolo_output',
                        help='Output directory for results')
    parser.add_argument('-p',
                        dest='pre_existing_clusters',
                        default=None,
                        help='column in cell_hashing_data_file.obs \
                        specifying different cell types or clusters')
    parser.add_argument('-q',
                        dest='plot_name',
                        default='hashing_qc_plots.pdf',
                        help='name of plot to output')
    parser.add_argument('-n',
                        dest='number_of_noise_barcodes',
                        default=None,
                        help='Number of barcodes to use to create noise \
                        distribution')

    args = parser.parse_args()

    model_json_file = args.model_json_file
    if model_json_file is not None:
        # read parameters
        with open(model_json_file) as model_json_open:
            params = json.load(model_json_open)
    else:
        params = {}
    data_file = args.data_file
    data_ext = os.path.splitext(data_file)[-1]
    if data_ext == '.h5ad':
        cell_hashing_adata = anndata.read(data_file)
    else:
        # fail fast: the code below would otherwise hit a NameError
        raise ValueError('Unrecognized file format: {}'.format(data_ext))

    if not os.path.isdir(args.out_dir):
        os.mkdir(args.out_dir)

    hashsolo(cell_hashing_adata,
             pre_existing_clusters=args.pre_existing_clusters,
             number_of_noise_barcodes=args.number_of_noise_barcodes,
             **params)
    cell_hashing_adata.write(os.path.join(args.out_dir, 'hashsoloed.h5ad'))
    plot_qc_checks_cell_hashing(cell_hashing_adata,
                                fig_path=os.path.join(args.out_dir,
                                                      args.plot_name))
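A hypothetical programmatic equivalent of this CLI (the `solo.hashsolo` import path is an assumption inferred from the calls above; file names are placeholders):

import anndata
from solo import hashsolo  # assumed import path

cell_hashing_adata = anndata.read('hashing_counts.h5ad')  # placeholder file
hashsolo.hashsolo(cell_hashing_adata)
cell_hashing_adata.write('hashsoloed.h5ad')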
Example #31
def test_raw():
    X = np.array(X_list)
    adata = ad.AnnData(X, obs=obs_dict, var=var_dict, uns=uns_dict, dtype='int32')

    # init raw
    adata.raw = adata

    assert adata.raw[:, 0].X.tolist() == [1, 4, 7]

    adata = adata[:, [0, 1]]

    assert adata.var_names.tolist() == ['var1', 'var2']
    assert adata.raw.var_names.tolist() == ['var1', 'var2', 'var3']

    # read write
    adata.write('./test.h5ad')
    adata = ad.read('./test.h5ad')

    assert adata.raw[:, 0].X.tolist() == [1, 4, 7]
    assert adata.raw.var_names.tolist() == ['var1', 'var2', 'var3']
    assert adata.var_names.tolist() == ['var1', 'var2']
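Throughout these examples, `ad.read` / `anndata.read` behaves as an alias of `read_h5ad` in the anndata versions these projects target; newer releases prefer the explicit reader:

import anndata as ad

adata = ad.read_h5ad('./test.h5ad')   # equivalent to ad.read('./test.h5ad')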