Example #1
def test_read_10x_mtx():
    sc.read_10x_mtx(os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices',
                                 'hg19_chr21'),
                    var_names='gene_symbols',
                    cache=True)
    sc.read_10x_mtx(os.path.join(ROOT, '3.0.0', 'filtered_feature_bc_matrix'),
                    var_names='gene_symbols',
                    cache=True)
Example #2
    def _read_raw_dataset(self):
#         sars_data_path = next(self._data_path_sars.glob('filtered_feature_bc_matrix'))
#         mock_data_path = next(self._data_path_mock.glob('filtered_feature_bc_matrix'))

#         sars_dataset = sc.read_10x_mtx(sars_data_path, cache=True)
#         mock_dataset = sc.read_10x_mtx(mock_data_path, cache=True)
#         dataset = sars_dataset.concatenate(mock_dataset, batch_categories=['SARS2', 'MOCK'], index_unique=None)       
        
        data_files = glob.glob(str(self._data_path / 'data/*/filtered_feature_bc_matrix'))
        sample_suffix = '_MON_crispr'

        # first load
        adatas = {}
        for i, file in enumerate(data_files):
            # batch name from the sample directory, e.g.
            # .../<sample><suffix>/filtered_feature_bc_matrix -> <sample>
            batch_name = file.split('/filtered_')[0].split('/')[-1].split(sample_suffix)[0]
            if i == 0:
                adata = sc.read_10x_mtx(file, cache=True)
                batch_key = batch_name
                adata.var_names_make_unique()
            else:
                adatas[batch_name] = sc.read_10x_mtx(file, cache=True)
                adatas[batch_name].var_names_make_unique()

        adata = adata.concatenate(*adatas.values(), batch_categories=[batch_key]+list(adatas.keys()), index_unique=None)
        del adatas
        # drop sars-cov-2 counts
        adata = adata[:, :-1]
        # drop suffixes from index
        adata.obs.index = adata.obs.index.str.replace('-SARS2', '').str.replace('-MOCK', '')
        # drop duplicate index
        adata = adata[~adata.obs.index.duplicated(keep=False)]
        return adata
Example #3
def join_SLAM(bdata, letter):
    for time in ['old', 'new']:
        adata = sc.read_10x_mtx('/fast/scratch/groups/ag_bluethgen/count' +
                                letter + '/' + time + '_matrix',
                                make_unique=False)
        adata.var_names_make_unique()

        # intersect vars
        bdata = bdata[:, np.isin(bdata.var_names, adata.var_names)].copy()
        adata = adata[:, np.isin(adata.var_names, bdata.var_names)].copy()

        # intersect obs (filtered)
        new_index = [index[:-2]
                     for index in adata.obs_names]  # clean index names
        adata.obs_names = new_index
        adata = adata[np.isin(adata.obs_names, bdata.obs_names), :].copy()

        # align
        adata = adata[:, np.argsort(adata.var_names)].copy()
        bdata = bdata[:, np.argsort(bdata.var_names)].copy()

        adata = adata[np.argsort(adata.obs_names), :].copy()
        bdata = bdata[np.argsort(bdata.obs_names), :].copy()

        print('adata ', adata.n_obs, adata.n_vars)
        print('bdata ', bdata.n_obs, bdata.n_vars)
        # add
        bdata.layers[time] = adata.X

    # overwrite u and s, then redo pool, then normalize, but keep umap. No filtering
    bdata.layers['unspliced'] = bdata.layers['new']
    bdata.layers['spliced'] = bdata.layers['old']
    scv.pp.neighbors(bdata)
    scv.pp.moments(bdata)  # pool, normalize
    return bdata
Example #4
def read_10x(input_10x_h5,
             input_10x_mtx,
             genome='hg19',
             var_names='gene_symbols',
             extra_obs=None,
             extra_var=None):
    """
    Wrapper function for sc.read_10x_h5() and sc.read_10x_mtx(), mainly to
    support adding extra metadata
    """
    if input_10x_h5 is not None:
        adata = sc.read_10x_h5(input_10x_h5, genome=genome)
    elif input_10x_mtx is not None:
        adata = sc.read_10x_mtx(input_10x_mtx, var_names=var_names)

    if extra_obs:
        obs_tbl = pd.read_csv(extra_obs, sep='\t', header=0, index_col=0)
        adata.obs = adata.obs.merge(
            obs_tbl,
            how='left',
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )

    if extra_var:
        var_tbl = pd.read_csv(extra_var, sep='\t', header=0, index_col=0)
        adata.var = adata.var.merge(
            var_tbl,
            how='left',
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )
    return adata
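A minimal usage sketch for this wrapper, assuming a CellRanger MTX directory and a tab-separated, barcode-indexed metadata table; both paths are hypothetical.
# Hypothetical paths: an MTX directory plus an optional barcode-indexed TSV.
adata = read_10x(
    input_10x_h5=None,
    input_10x_mtx='sample1/filtered_feature_bc_matrix',
    extra_obs='sample1/cell_metadata.tsv',
    extra_var=None,
)
print(adata)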
Example #5
def load_ds(path):
    """
    H5 files are named like this GSM4698176_Sample_1_filtered_feature_bc_matrix.h5

    Fastcar outputs are in …fastcar/{sample}/matrix.mtx.gz
    """
    fname = os.path.basename(path)
    if os.path.exists(os.path.join(path, "matrix.mtx.gz")):
        sample = fname
        ds = sc.read_10x_mtx(path)
    elif fname.endswith(".h5"):
        sample = "_".join(fname.split("_")[1:3])
        ds = sc.read_10x_h5(path)
    else:
        raise ValueError(f"Unknown input path {path}")
    ds.var_names = rename_genes(ds.var_names)
    ds.var_names_make_unique(join=".")
    ds.obs["orig.ident"] = sample
    ds.obs_names = sample + "_" + ds.obs_names.str.replace(r"-\d$", "", regex=True)
    sc.pp.filter_cells(ds, min_genes=200)
    sc.pp.filter_genes(ds, min_cells=3)

    meta = SAMPLES.loc[SAMPLES.Sample == sample, :]
    ds.obs["Patient"] = meta.Patient.values[0]
    ds.obs["Day of intubation"] = meta["Day of intubation"].values[0]
    ds.obs["COVID-19"] = meta["COVID-19"].values[0]
    ds.obs["Sample"] = sample
    return ds
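load_ds relies on a module-level SAMPLES table and a rename_genes helper that are not shown here; the sketch below shows the shape they would need to have (column names inferred from the lookups above, values made up).
import pandas as pd

# Assumed metadata table consulted by load_ds; one row per sample.
SAMPLES = pd.DataFrame({
    "Sample": ["Sample_1"],
    "Patient": ["P1"],
    "Day of intubation": [3],
    "COVID-19": [True],
})

def rename_genes(var_names):
    # Placeholder for the project's gene-renaming helper.
    return var_names

# Hypothetical GEO-style h5 file following the naming scheme in the docstring.
ds = load_ds("GSM0000000_Sample_1_filtered_feature_bc_matrix.h5")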
Example #6
def test_recipe():
    try:
        adata = sc.datasets.pbmc3k()
    except Exception:
        fname = 'pbmc3k_filtered_gene_bc_matrices.tar.gz'
        url = 'http://cf.10xgenomics.com/samples/cell-exp/1.1.0/pbmc3k/' + fname
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            with open('filtered_gene_bc_matrices.tar.gz', 'wb') as f:
                f.write(r.raw.read())
        os.system('tar -xzvf filtered_gene_bc_matrices.tar.gz')
        adata = sc.read_10x_mtx('filtered_gene_bc_matrices/hg19')
    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata)
    assert not _adata.obs['filter_rna'].empty
    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(
        _adata, mito_cutoff=None)  # weird segmentation fault in the tests
    assert not _adata.obs['gmm_pct_count_clusters_keep'].empty
    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, min_counts=100, max_counts=20000)
    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, min_counts=100)
    _adata = adata.copy()
    ddl.pp.external.recipe_scanpy_qc(_adata, max_counts=20000)
Example #7
def read_file(filename, transpose=False):
    adata = None
    if os.path.exists(filename):
        if os.path.isdir(filename):
            adata = sc.read_10x_mtx(filename)

        elif os.path.isfile(filename):
            name, filetype = os.path.splitext(filename)
            if filetype == ".txt":
                adata = sc.read_text(filename)
            elif filetype == ".csv":
                adata = sc.read_csv(filename)
            elif filetype == ".h5ad":
                adata = sc.read(filename)
            else:
                print(
                    "ERROR: the format must be [H5AD|CSV|TXT] for file or 10x-MTX for directory."
                )
                sys.exit()

        if transpose:
            adata = adata.transpose()
    else:
        sys.exit("ERROR: no such file or directory.")

    if not isinstance(adata.X, np.ndarray):
        X = adata.X.toarray()
        adata = anndata.AnnData(X, obs=adata.obs, var=adata.var)
    return adata
Example #8
    def load_data(data):
        if isfile(data):
            name, extension = splitext(data)
            if extension == ".h5ad":
                adata = sc.read_h5ad(data)
            elif extension == ".loom":
                adata = sc.read_loom(data)
            else:
                raise click.FileError(data, hint="does not have a valid extension [.h5ad | .loom]")
        elif isdir(data):
            if not data.endswith(sep):
                data += sep
            adata = sc.read_10x_mtx(data)
        else:
            raise click.FileError(data, hint="not a valid file or path")

        if set_obs_names != "":
            if set_obs_names not in adata.obs_keys():
                raise click.UsageError(f"obs {set_obs_names} not found, options are: {adata.obs_keys()}")
            adata.obs_names = adata.obs[set_obs_names]
        if set_var_names != "":
            if set_var_names not in adata.var_keys():
                raise click.UsageError(f"var {set_var_names} not found, options are: {adata.var_keys()}")
            adata.var_names = adata.var[set_var_names]
        if make_obs_names_unique:
            adata.obs.index = make_index_unique(adata.obs.index)
        if make_var_names_unique:
            adata.var.index = make_index_unique(adata.var.index)
        if not adata.obs.index.is_unique:
            click.echo("Warning: obs index is not unique")
        if not adata.var.index.is_unique:
            click.echo("Warning: var index is not unique")
        return adata
Example #9
def read_10x_data(input_file,
                  format_type='10x_h5',
                  backed=None,
                  transpose=False,
                  sparse=False):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    elif format_type == "10x_txt":
        adata = sc.read_csv(input_file, delimiter="\t")
    else:
        raise ValueError("`format_type` needs to be one of '10x_h5', '10x_mtx', '10x_h5ad', '10x_csv' or '10x_txt'")

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata
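A usage sketch for the dispatcher above; the file name is a placeholder, and csr_matrix must be importable in the module for the sparse=True branch.
from scipy.sparse import csr_matrix  # required by the sparse=True branch above

# Hypothetical CellRanger HDF5 file, loaded and stored as float32 CSR.
adata = read_10x_data('sample1_filtered_feature_bc_matrix.h5',
                      format_type='10x_h5',
                      sparse=True)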
Example #10
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
    if prefix is not None:
        # Build files named "prefix_XXX.xxx" in a temporary directory.
        mtx_path_orig = mtx_path
        mtx_path = tmp_path / "filtered_gene_bc_matrices_prefix"
        mtx_path.mkdir()
        for item in mtx_path_orig.iterdir():
            if item.is_file():
                shutil.copyfile(item, mtx_path / f"{prefix}{item.name}")

    mtx = sc.read_10x_mtx(mtx_path, var_names="gene_symbols", prefix=prefix)
    h5 = sc.read_10x_h5(h5_path)

    # Drop genome column for comparing v3
    if "3.0.0" in str(h5_path):
        h5.var.drop(columns="genome", inplace=True)

    # Check equivalence
    assert_anndata_equal(mtx, h5)

    # Test that it can be written:
    from_mtx_pth = tmp_path / "from_mtx.h5ad"
    from_h5_pth = tmp_path / "from_h5.h5ad"

    mtx.write(from_mtx_pth)
    h5.write(from_h5_pth)

    assert_anndata_equal(sc.read_h5ad(from_mtx_pth), sc.read_h5ad(from_h5_pth))
Example #11
def read_cellranger(fn, args, rm_zero_cells=True, add_sample_id=True, **kw):
    """read cellranger results

    Assumes the Sample_ID may be extracted from the cellranger output dirname,
    e.g. ` ... /Sample_ID/outs/filtered_feature_bc_matrix.h5 `
    """
    if fn.endswith('.h5'):
        dirname = os.path.dirname(fn)
        data = sc.read_10x_h5(fn)
        data.var['gene_symbols'] = list(data.var_names)
        data.var_names = list(data.var['gene_ids'])
    else:
        mtx_dir = os.path.dirname(fn)
        dirname = os.path.dirname(mtx_dir)
        data = sc.read_10x_mtx(mtx_dir, gex_only=args.gex_only, var_names='gene_ids')
        data.var['gene_ids'] = list(data.var_names)

    if add_sample_id:
        barcodes = [b.split('-')[0] for b in data.obs.index]
        if len(barcodes) == len(set(barcodes)):
            data.obs_names = barcodes
        sample_id = os.path.basename(os.path.dirname(dirname))
        data.obs['sample_id'] = sample_id
        data.obs['sample_id'] = data.obs['sample_id'].astype('category')
        data.obs_names = [i + '-' + sample_id for i in data.obs_names]
        
    return data
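A usage sketch for read_cellranger; the path layout follows the docstring ( ... /Sample_ID/outs/ ... ), and the argparse-style namespace only needs the gex_only attribute the function reads. Everything here is made up.
from types import SimpleNamespace

# Minimal stand-in for the parsed CLI arguments the function expects.
args = SimpleNamespace(gex_only=True)
adata = read_cellranger(
    'runs/Sample_A/outs/filtered_feature_bc_matrix/matrix.mtx.gz',
    args,
)
print(adata.obs['sample_id'].unique())  # -> ['Sample_A']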
Example #12
def read_adata(
        gex_data, # filename
        gex_data_type # string describing file type
):
    ''' Split this out so that other code can use it. Read GEX data
    '''
    print('reading:', gex_data, 'of type', gex_data_type)
    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad( gex_data )

    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx( gex_data )

    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5( gex_data, gex_only=True )

    elif gex_data_type == 'loom':
        adata = sc.read_loom( gex_data )

    else:
        print('unrecognized gex_data_type:', gex_data_type, "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")
        exit()

    if adata.is_view:  # realize the view so the object can be modified downstream
        adata = adata.copy()
    return adata
Example #13
def make_raw_dataset(samples, path, name):
    """
    Function to load, preprocess and concatenate a dataset from multiple RNAseq
     samples
    Inputs:
     samples, dictionary of sample file prefixes as keys and timepoint metadata
      as values
     path, path to directory containing sample files
     name, dataset name for labeling AnnData object metadata
    Output: AnnData object of concatenated samples, annotated with dataset,
     timepoint, and sample id labels
    """
    anndata_dict = {}

    for sm in samples.keys():
        print(sm)

        # read in data from GEO file
        data = sc.read_10x_mtx(path, prefix=sm, cache=True)

        # add metadata information
        data.obs['dataset'] = name
        data.obs['timepoint'] = samples[sm]

        # add to dict for concatenation
        anndata_dict[sm] = data

    # concatenate samples
    data_full = ad.concat(anndata_dict,
                          join='outer',
                          label='sample id',
                          index_unique='_',
                          fill_value=0.0)
    return data_full
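A usage sketch for make_raw_dataset, assuming the GEO-style prefix convention that sc.read_10x_mtx(prefix=...) expects (files like <prefix>matrix.mtx.gz inside path); names and timepoints are invented.
# Hypothetical prefixes mapped to timepoint labels.
samples = {
    'GSM0000001_d0_': 'day0',
    'GSM0000002_d7_': 'day7',
}
adata = make_raw_dataset(samples, path='data/geo_raw/', name='timecourse')
print(adata.obs[['dataset', 'timepoint', 'sample id']].head())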
Example #14
def read_10x(file):
    # read 10x data
    # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.settings.verbosity = 3
    sc.logging.print_header()
    sc.settings.set_figure_params(dpi=80, facecolor='white')

    # the file that will store the analysis results
    results_file = 'write/pbmc3k.h5ad'

    # os.chdir(file)
    dir_name = ''
    path10x = file
    for i in os.listdir(file):
        if i.startswith('filter'):
            dir_name = i
            # listdir order is arbitrary, so test membership rather than the first entry
            if 'hg19' in os.listdir(os.path.join(file, dir_name)):
                path10x = os.path.join(file, dir_name, 'hg19')  # the directory with the `.mtx` file
            else:
                path10x = os.path.join(file, dir_name)  # the directory with the `.mtx` file

    adata = sc.read_10x_mtx(
        path10x,
        # use gene symbols for the variable names (variables-axis index)
        var_names='gene_symbols',
        cache=True)
    adata.var_names_make_unique()
    # os.chdir('../../..')
    adata.uns['dataset'] = 'data10x'
    return AnnData(adata)
Example #15
def read_data(inp_path, out_path):
    global adata
    adata = sc.read_10x_mtx(
        inp_path,  # the directory with the `.mtx` file
        var_names='gene_symbols',  # use gene symbols for the variable names (variables-axis index)
        cache=True)
Example #16
def open_10_genomics_data(dir):
    """
        Open a 10x Genomics matrix directory into an AnnData object.
    """
    adata = sc.read_10x_mtx(dir, var_names='gene_symbols', cache=True)
    adata.var_names_make_unique()
    return adata
Example #17
def test_read_10x_v1():
    v1_mtx = sc.read_10x_mtx(os.path.join(ROOT, '1.2.0',
                                          'filtered_gene_bc_matrices',
                                          'hg19_chr21'),
                             var_names='gene_symbols')
    v1_h5 = sc.read_10x_h5(
        os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices_h5.h5'))
    assert_anndata_equal(v1_mtx, v1_h5)
Example #18
def load_data(path, dtype='dge'):
    if dtype == 'dge':
        dataset = ad.read_text(path)

    elif dtype == '10x':
        dataset = sc.read_10x_mtx(path, var_names='gene_symbols', cache=True)

    return dataset
Example #19
def test_read_10x_v3():
    v3_mtx = sc.read_10x_mtx(
        ROOT / '3.0.0' / 'filtered_feature_bc_matrix',
        var_names='gene_symbols',
    )
    v3_h5 = sc.read_10x_h5(ROOT / '3.0.0' / 'filtered_feature_bc_matrix.h5')
    v3_h5.var.drop(columns="genome", inplace=True)
    assert_anndata_equal(v3_mtx, v3_h5)
Example #20
def get_adata(matrix_path):
    """
    Convert 10x Feature-Barcode Matrix file to AnnData
    :param str matrix_path: path to 10x feature-barcode matrix file
    :return anndata: Anndata object
    """
    sc.settings.verbosity = 3
    adata = sc.read_10x_mtx(matrix_path, var_names='gene_symbols', cache=True)
    adata.var_names_make_unique()
    return adata
Example #21
    def from_matrix_mtx(cls, path_to_matrix):
        """
        Factory function for reading gene expression matrix from mtx formatted
        10x output.
        """

        anndata_matrix = scanpy.read_10x_mtx(path_to_matrix, gex_only=True)
        anndata_matrix.var_names_make_unique()

        return cls(anndata_matrix)
Example #22
def get_file(source, accession, matrixFile):
    matrix = zipfile.ZipFile(matrixFile.file, 'r')
    #matrix = zipfile.ZipFile(matrixFile, 'r')

    matrix.extractall("/tmp")
    filePath = "/tmp/" + source + "/" + accession
    adata = sc.read_10x_mtx(filePath, var_names="gene_symbols", cache=True)
    #shutil.rmtree(filePath)

    return adata
Example #23
def read_10x_mtx(filename: PathLike,
                 atac_only: bool = True,
                 *args,
                 **kwargs) -> AnnData:
    adata = sc.read_10x_mtx(filename, *args, gex_only=False, **kwargs)
    if atac_only:
        # keep only ATAC features; copy so a view is not returned
        adata = adata[:, (adata.var["feature_types"] == "Peaks").values].copy()
    return adata
Example #24
File: io.py  Project: gtca/muon
def read_10x_mtx(filename: PathLike,
                 prot_only: bool = True,
                 *args,
                 **kwargs) -> AnnData:
    adata = sc.read_10x_mtx(filename, *args, gex_only=False, **kwargs)
    if prot_only:
        # keep only Antibody Capture (protein) features
        adata = adata[:, (adata.var["feature_types"] == "Antibody Capture").values].copy()
    return adata
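Both muon-style readers above split a mixed feature matrix on var['feature_types']; here is a sketch of calling the protein variant on a CITE-seq style filtered_feature_bc_matrix (the path is hypothetical, and extra keyword arguments are forwarded to sc.read_10x_mtx).
# Hypothetical 10x directory containing 'Gene Expression' and
# 'Antibody Capture' features; only the protein features are kept.
prot = read_10x_mtx('pbmc_cite/filtered_feature_bc_matrix',
                    prot_only=True,
                    var_names='gene_symbols')
print(prot.var['feature_types'].value_counts())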
Example #25
def test_pbmc_cite(save_path):
    file_path = os.path.join(
        save_path, "10X/pbmc_10k_protein_v3/filtered_feature_bc_matrix.tar.gz")
    sp = os.path.join(save_path, "10X/pbmc_10k_protein_v3/")
    tar = tarfile.open(file_path, "r:gz")
    tar.extractall(path=sp)
    tar.close()
    dataset = sc.read_10x_mtx(os.path.join(sp, "filtered_feature_bc_matrix"),
                              gex_only=False)
    organize_cite_seq_10x(dataset)
    unsupervised_training_one_epoch(dataset)
Example #26
def generate_adata_from_10X(session_ID, data_type="10X_mtx"):
    data_dir = save_analysis_path + str(session_ID) + "/raw_data/"
    if (data_type == "10X_mtx"):
        adata = sc.read_10x_mtx(data_dir, cache=False)
    elif (data_type == "10X_h5"):
        adata = sc.read_10x_h5(data_dir + "data.h5ad")
    else:
        print("[ERROR] data type not recognized - returning None")
        return None

    cache_adata(session_ID, adata)
    return adata
Example #27
def load3k(cells: 'mito all seurat' = 'mito',
           subsample=.15,
           seed=None) -> 'anndata object':
    adata = sc.read_10x_mtx('../data/3k/hg19/',
                            var_names='gene_symbols',
                            cache=True)
    adata.obs['labels'] = loadlabels(load("../data/3k/pbmc.3k.labels"),
                                     load("../data/3k/hg19/barcodes.tsv"))

    adata = filter(adata, cells)
    adata = do_subsample(adata, subsample, seed)
    return adata
Example #28
def load_anndata_from_input_and_output(
        input_dir_10x: str,
        cellbender_h5: str,
        analyzed_barcodes_only: bool = True) -> 'anndata.AnnData':
    """Load remove-background output count matrix into an anndata object.

    Args:
        input_dir_10x: Raw 10x dir.
        cellbender_h5: Output h5 file created by remove-background (can be
            filtered or not).
        analyzed_barcodes_only: Argument passed to anndata_from_h5().
            False to load all barcodes, so that the size of
            the AnnData object will match the size of the input raw count
            matrix. True to load a limited set of barcodes: only those
            analyzed by the algorithm. This allows relevant latent variables
            to be loaded properly into adata.obs and adata.obsm, rather than
            adata.uns.

    Return:
        adata_out: AnnData object with counts before and after
            remove-background, as well as inferred latent variables from
            remove-background.
    """
    # Load input data.
    adata_10x = sc.read_10x_mtx(
        path=input_dir_10x,
        # var_names='gene_symbols',
        var_names='gene_ids',
        make_unique=False)

    # Load remove-background output data.
    # We need to do this because of bug here:
    # https://github.com/broadinstitute/CellBender/issues/57
    adata_out = anndata_from_h5(cellbender_h5,
                                analyzed_barcodes_only=analyzed_barcodes_only)

    # Subset the raw dataset to the relevant barcodes.
    adata_10x = adata_10x[adata_out.obs.index]

    # Put count matrices into 'layers' in anndata for clarity.
    adata_out.layers['counts_raw'] = adata_10x.X.copy()
    adata_out.layers['counts_cellbender'] = adata_out.X.copy()

    # Pre-compute a bit of metadata.
    # adata_out.var['n_cellranger'] = np.array(
    #     adata_out.layers['cellranger'].sum(axis=0)
    # ).squeeze()
    # adata_out.var['n_cellbender'] = np.array(
    #     adata_out.layers['cellbender'].sum(axis=0)
    # ).squeeze()

    return adata_out
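A usage sketch for the CellBender loader above; both paths are hypothetical, and anndata_from_h5 is assumed to be available in the module as referenced in the function body.
# Hypothetical raw 10x directory and matching remove-background output.
adata = load_anndata_from_input_and_output(
    input_dir_10x='sample1/raw_feature_bc_matrix',
    cellbender_h5='sample1/cellbender_out_filtered.h5',
    analyzed_barcodes_only=True,
)
# Raw and cleaned counts now sit side by side as layers.
print(list(adata.layers.keys()))  # ['counts_raw', 'counts_cellbender']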
Example #29
def read_10x_data(input_file, format_type='10x_h5', backed=None):
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    else:
        raise ValueError("`format_type` needs to be one of '10x_h5', '10x_mtx', '10x_h5ad' or '10x_csv'")

    adata.var_names_make_unique()
    return adata
Example #30
def main(src_dir, out_dir, out_prefix, mito_prefix):
    sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
    sc.logging.print_versions()
    sc.settings.set_figure_params(dpi=80)
    sc.settings.figdir = out_dir + "/"
    adata = sc.read_10x_mtx(src_dir, var_names='gene_symbols', cache=True)
    adata.var_names_make_unique()
    adata2 = adata.copy()
    adata3 = adata.copy()
    adata4 = adata.copy()
    seurat_wf_plots(adata, out_dir, out_prefix, mito_prefix)
    recipe_seurat(adata2, out_dir, out_prefix)
    recipe_zheng17(adata3, out_dir, out_prefix)
    scanpy_qc(adata4, out_dir, out_prefix, mito_prefix)