Esempio n. 1
0
def test_read_10x_h5():
    """Smoke-test ``sc.read_10x_h5`` on the 1.2.0 and 3.0.0 fixture files.

    Each file is read with its genome named explicitly. Nothing is asserted
    about the result; the test passes when neither read raises.
    """
    fixtures = (
        (os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices_h5.h5'),
         'hg19_chr21'),
        (os.path.join(ROOT, '3.0.0', 'filtered_feature_bc_matrix.h5'),
         'GRCh38_chr21'),
    )
    for h5_path, genome_name in fixtures:
        sc.read_10x_h5(h5_path, genome=genome_name)
Esempio n. 2
0
def test_error_missing_genome():
    """Requesting an unknown genome must raise a ValueError.

    The error message is expected to match a pattern containing the genome
    name that the respective fixture file does hold.
    """
    cases = (
        (ROOT / '1.2.0' / 'filtered_gene_bc_matrices_h5.h5',
         r".*hg19_chr21.*"),
        (ROOT / '3.0.0' / 'filtered_feature_bc_matrix.h5',
         r".*GRCh38_chr21.*"),
    )
    for h5_path, expected_message in cases:
        with pytest.raises(ValueError, match=expected_message):
            sc.read_10x_h5(h5_path, genome="not a genome")
Esempio n. 3
0
def test_read_10x_h5_v1():
    """Reading the 1.2.0 fixture with and without ``genome`` must agree."""
    h5_path = os.path.join(ROOT, '1.2.0', 'filtered_gene_bc_matrices_h5.h5')
    with_genome = sc.read_10x_h5(h5_path, genome='hg19_chr21')
    without_genome = sc.read_10x_h5(h5_path)
    assert_anndata_equal(with_genome, without_genome)
Esempio n. 4
0
def test_read_10x_h5():
    """Reading the 3.0.0 fixture with and without ``genome`` must agree."""
    h5_path = os.path.join(ROOT, '3.0.0', 'filtered_feature_bc_matrix.h5')
    with_genome = sc.read_10x_h5(h5_path, genome='GRCh38_chr21')
    without_genome = sc.read_10x_h5(h5_path)
    assert_anndata_equal(with_genome, without_genome)
Esempio n. 5
0
def test_read_visium():
    """Reading the Visium fixture with and without ``genome`` must agree."""
    h5_path = ROOT / 'visium' / 'V1_Human_Heart_subsampled.h5'
    with_genome = sc.read_10x_h5(h5_path, genome='GRCh38')
    without_genome = sc.read_10x_h5(h5_path)
    assert_anndata_equal(with_genome, without_genome)
Esempio n. 6
0
def test_read_10x_h5():
    """Reading the 3.0.0 fixture with and without ``genome`` must agree."""
    h5_path = ROOT / '3.0.0' / 'filtered_feature_bc_matrix.h5'
    with_genome = sc.read_10x_h5(h5_path, genome='GRCh38_chr21')
    without_genome = sc.read_10x_h5(h5_path)
    assert_anndata_equal(with_genome, without_genome)
Esempio n. 7
0
def test_read_10x_h5_v1():
    """Reading the 1.2.0 fixture with and without ``genome`` must agree."""
    h5_path = ROOT / '1.2.0' / 'filtered_gene_bc_matrices_h5.h5'
    with_genome = sc.read_10x_h5(h5_path, genome='hg19_chr21')
    without_genome = sc.read_10x_h5(h5_path)
    assert_anndata_equal(with_genome, without_genome)
Esempio n. 8
0
def read_files(mousefile, humanfile):
    """Read the mouse and (optionally) the human 10x HDF5 files.

    Parameters
    ----------
    mousefile
        Path passed to ``sc.read_10x_h5``; always read.
    humanfile
        Path passed to ``sc.read_10x_h5``, or ``None`` to skip the human data.

    Returns
    -------
    tuple
        ``(mouse, human)`` where ``human`` is ``None`` when no human file
        was given.
    """
    mouse = sc.read_10x_h5(mousefile)
    # Identity check: `!= None` would invoke a custom __ne__ on path-like objects.
    human = sc.read_10x_h5(humanfile) if humanfile is not None else None
    return (mouse, human)
Esempio n. 9
0
def test_error_10x_h5_legacy(tmp_path):
    """A legacy file holding two genome groups needs an explicit ``genome``.

    The fixture's ``hg19_chr21`` group is copied twice into a temporary file
    (once under its own name, once as ``hg19_chr21_copy``). Reading without
    ``genome`` must raise ValueError; reading with one of the names must work.
    """
    source_path = ROOT / '1.2.0' / 'filtered_gene_bc_matrices_h5.h5'
    two_genome_path = tmp_path / "two_genomes.h5"

    with h5py.File(source_path, "r") as src:
        with h5py.File(two_genome_path, "w") as dst:
            src.copy("hg19_chr21", dst)
            src.copy("hg19_chr21", dst, name="hg19_chr21_copy")

    # Ambiguous: no genome given although the file contains two groups.
    with pytest.raises(ValueError):
        sc.read_10x_h5(two_genome_path)

    # Unambiguous once a genome is named.
    sc.read_10x_h5(two_genome_path, genome="hg19_chr21_copy")
Esempio n. 10
0
def check_inputs(mousefile, humanfile, cutoff_top, cutoff_bottom):
    """Validate the cutoffs and check that the input files can be read.

    Parameters
    ----------
    mousefile
        Path to the mouse matrix; read once with ``sc.read_10x_h5`` as a check.
    humanfile
        Path to the human matrix, or ``None`` to skip it.
    cutoff_top, cutoff_bottom
        Fractions that must lie in the closed interval ``[0, 1]``.

    Raises
    ------
    NameError
        If either cutoff is outside ``[0, 1]``. The exception type is kept
        for backward compatibility with existing callers.

    Notes
    -----
    Unreadable files are reported with a printed message and do not raise,
    matching the original best-effort behaviour.
    """
    for cutoff in (cutoff_top, cutoff_bottom):
        if cutoff > 1 or cutoff < 0:
            # Bounds are inclusive, so the message says so.
            raise NameError('cutoff has to be within 0 <= cutoff <= 1')
    try:
        sc.read_10x_h5(mousefile)
        if humanfile is not None:
            sc.read_10x_h5(humanfile)
    except Exception:
        # `Exception` rather than a bare except, so Ctrl-C and SystemExit
        # are no longer swallowed.
        print('the files are not readable as h5.')
Esempio n. 11
0
def main(argv):
    """Convert a 10x HDF5 matrix into an ``.h5ad`` file.

    Parameters
    ----------
    argv : list of str
        Command-line arguments without the program name. Recognised options:
        ``-h`` (usage), ``-i/--ifile`` (input ``.h5``) and ``-o/--ofile``
        (output ``.h5ad``).

    Exits with status 2 on an unknown option or when the input or output
    path is missing, and with status 0 after printing usage for ``-h``.
    """
    usage = 'test.py -i <inputfile.h5> -o <outputfile.h5ad>'
    inputfile = ''
    outputfile = ''
    try:
        opts, _ = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Fail early with the usage text instead of an error from an empty path.
    if not inputfile or not outputfile:
        print(usage)
        sys.exit(2)

    print(f'Input file is "{inputfile}"')
    print(f'Output file is "{outputfile}"')

    adata = sc.read_10x_h5(inputfile)
    # Variable names read from the file may repeat; make them unique before writing.
    adata.var_names_make_unique()
    adata.write(outputfile)
Esempio n. 12
0
def read_cellranger(fn, args, rm_zero_cells=True, add_sample_id=True, **kw):
    """Read cellranger output from an ``.h5`` file or a matrix directory.

    The sample ID is taken from the cellranger output directory name,
    e.g. ``.../Sample_ID/outs/filtered_feature_bc_matrix.h5``.

    Parameters
    ----------
    fn : str
        Path ending in ``.h5``, or a path to a file inside an mtx directory.
    args
        Object providing ``gex_only`` (used only for the mtx branch).
    rm_zero_cells, **kw
        Accepted but not used by this function body.
    add_sample_id : bool
        When true, add an ``obs['sample_id']`` category column and suffix
        every observation name with ``-<sample_id>``.

    Returns
    -------
    The loaded data with gene IDs as ``var_names`` and a ``gene_ids`` column
    in ``var``.
    """
    if fn.endswith('.h5'):
        outs_dir = os.path.dirname(fn)
        adata = sc.read_10x_h5(fn)
        # Keep the names read from the file as a column, then index by gene ID.
        adata.var['gene_symbols'] = list(adata.var_names)
        adata.var_names = list(adata.var['gene_ids'])
    else:
        matrix_dir = os.path.dirname(fn)
        outs_dir = os.path.dirname(matrix_dir)
        adata = sc.read_10x_mtx(matrix_dir,
                                gex_only=args.gex_only,
                                var_names='gene_ids')
        adata.var['gene_ids'] = list(adata.var_names)

    if not add_sample_id:
        return adata

    # Drop the "-<suffix>" from barcodes only if that keeps them unique.
    stripped = [name.split('-')[0] for name in adata.obs.index]
    if len(set(stripped)) == len(stripped):
        adata.obs_names = stripped

    sample_id = os.path.basename(os.path.dirname(outs_dir))
    adata.obs['sample_id'] = sample_id
    adata.obs['sample_id'] = adata.obs['sample_id'].astype('category')
    adata.obs_names = [name + '-' + sample_id for name in adata.obs_names]
    return adata
Esempio n. 13
0
def read_sample(
    sample_meta,
    min_genes=200,
    min_cells=5,
):
    """Load one sample's filtered 10x matrix and attach its metadata.

    Parameters
    ----------
    sample_meta
        Metadata record for the sample. Must provide ``Sample`` and ``__dir``
        and may provide ``__filter_cells`` (path to a CSV whose index lists
        the cells to keep). It is iterated with ``.items()``; presumably a
        pandas Series (one row of a sample sheet) -- confirm against caller.
    min_genes : int
        Passed to ``sc.pp.filter_cells``.
    min_cells : int
        Passed to ``sc.pp.filter_genes``.

    Returns
    -------
    The filtered dataset, with ``layers["counts"]`` set to ``X`` and every
    metadata field whose key does not start with ``__`` stored in ``obs``.
    """
    sample = sample_meta.Sample
    sample_dir = sample_meta.__dir

    ds = sc.read_10x_h5(
        os.path.join(sample_dir, sample, "outs",
                     "filtered_feature_bc_matrix.h5"))
    ds.var_names = rename_genes(ds.var_names)
    ds.var_names_make_unique(join=".")
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    ds.obs_names = sample + "_" + ds.obs_names.str.replace(
        r"-\d$", "", regex=True)
    sc.pp.filter_cells(ds, min_genes=min_genes)
    sc.pp.filter_genes(ds, min_cells=min_cells)

    if "__filter_cells" in sample_meta and isinstance(
            sample_meta.__filter_cells, str):
        sample_cells = pd.read_csv(sample_meta.__filter_cells, index_col=0)
        sample_cells.index = sample_cells.index.str.replace(r"-\d+$",
                                                            "",
                                                            regex=True)
        ds = ds[ds.obs_names.isin(sample_cells.index), :]

    ds.layers["counts"] = ds.X
    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    for k, v in sample_meta.items():
        if k.startswith("__"):
            # Double-underscore keys are internal settings, not annotations.
            continue
        ds.obs[k] = v
    return ds
Esempio n. 14
0
def run(args):
    """Compile an AnnData object from a 10X h5 file and save it as h5ad.

    Parameters
    ----------
    args
        Parsed options; ``args.ifile`` is the input ``.h5`` path and
        ``args.ofile`` the output ``.h5ad`` path.

    Adds ``obs['n_counts']`` (row sums of ``X``), ``obs['n_kmers']`` (number
    of non-zero entries per row) and ``var['n_cells']`` (number of non-zero
    entries per column) before writing.
    """
    options = args

    # take options
    h5_fname = options.ifile
    out_fname = options.ofile

    # read
    adata = sc.read_10x_h5(h5_fname)

    # `np.asarray(...).ravel()` flattens np.matrix results (sparse matrices)
    # as well as plain ndarrays, unlike `.A1` which exists only on np.matrix.
    nonzero = adata.X > 0

    # add n_counts and n_kmers to adata.obs
    adata.obs['n_counts'] = np.asarray(adata.X.sum(axis=1)).ravel()
    adata.obs['n_kmers'] = np.asarray(nonzero.sum(axis=1)).ravel()

    # add n_cells to adata.var
    adata.var['n_cells'] = np.asarray(nonzero.sum(axis=0)).ravel()

    # save adata to h5ad
    adata.write_h5ad(filename=out_fname)
    return
Esempio n. 15
0
def run_scrublet(tenx_h5, doublet_rate=0.06, npca=40, save_to=None):
    """Run Scrublet doublet scoring on 10x data and save the results.

    Parameters
    ----------
    tenx_h5 : str
        Either a path ending in ``.h5`` or a directory containing
        ``matrix.mtx.gz`` and ``barcodes.tsv.gz``.
    doublet_rate : float
        Passed to Scrublet as ``expected_doublet_rate``.
    npca : int
        Passed to ``scrub_doublets`` as ``n_prin_comps``.
    save_to : str
        Prefix for the output files: ``<save_to>doublets.csv``,
        ``<save_to>doublet_hist.pdf`` and ``<save_to>threshold.txt``.
        The threshold file is only written if it does not exist yet.

    Raises
    ------
    ValueError
        If ``save_to`` is empty or ``None``.
    """
    if not save_to:
        raise ValueError(
            "Please, specify prefix path where to save results to")
    if tenx_h5.endswith(".h5"):
        ds = sc.read_10x_h5(tenx_h5)
        counts_matrix = ds.X.tocsc().astype(np.longlong)
        obs = ds.obs.reset_index()
        obs.columns = ["0"]
    else:
        # Context managers make sure the gzip handles are closed.
        with gzip.open(tenx_h5 + '/matrix.mtx.gz') as mtx_file:
            counts_matrix = scipy.io.mmread(mtx_file).T.tocsc()
        with gzip.open(tenx_h5 + '/barcodes.tsv.gz') as barcode_file:
            obs = pd.read_table(barcode_file, header=None)
    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=doublet_rate)
    doublet_scores, doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=npca)
    save_dir = os.path.dirname(save_to)
    # A bare prefix such as "sample_" has no directory part, and
    # os.makedirs('') would raise; only create a directory when there is one.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    obs['doublet'] = doublet_scores
    obs.to_csv(save_to + 'doublets.csv')
    scrub.plot_histogram()
    plt.savefig(save_to + 'doublet_hist.pdf')
    if not os.path.exists(save_to + 'threshold.txt'):
        with open(save_to + 'threshold.txt', 'w') as f:
            f.write(str(scrub.threshold_))
0
def read_10x_data(input_file,
                  format_type='10x_h5',
                  backed=None,
                  transpose=False,
                  sparse=False):
    """Read expression data in one of several formats into one object.

    Parameters
    ----------
    input_file
        Path to the file (or directory for ``'10x_mtx'``) to read.
    format_type : str
        One of ``'10x_h5'``, ``'10x_mtx'``, ``'10x_h5ad'``, ``'10x_csv'``
        or ``'10x_txt'`` (tab-delimited text).
    backed
        Forwarded to ``sc.read_h5ad``; only used for ``'10x_h5ad'``.
    transpose : bool
        Transpose the data after reading.
    sparse : bool
        Convert ``X`` to a float32 CSR matrix.

    Returns
    -------
    The loaded data with unique variable and observation names.

    Raises
    ------
    ValueError
        If ``format_type`` is not one of the supported values.
    """
    if format_type == '10x_h5':
        adata = sc.read_10x_h5(input_file)
    elif format_type == '10x_mtx':
        adata = sc.read_10x_mtx(input_file)
    elif format_type == '10x_h5ad':
        adata = sc.read_h5ad(input_file, backed=backed)
    elif format_type == "10x_csv":
        adata = sc.read_csv(input_file)
    elif format_type == "10x_txt":
        adata = sc.read_csv(input_file, delimiter="\t")
    else:
        # List every accepted value and the real parameter name.
        raise ValueError(
            "`format_type` needs to be one of '10x_h5', '10x_mtx', "
            "'10x_h5ad', '10x_csv' or '10x_txt'; got {!r}".format(
                format_type))

    if transpose:
        adata = adata.transpose()
    if sparse:
        adata.X = csr_matrix(adata.X, dtype='float32')
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata
Esempio n. 17
0
def read_adata(
        gex_data, # filename
        gex_data_type # string describing file type
):
    ''' Read gene-expression data; split out so that other code can use it.

    gex_data: path of the file (or directory for '10x_mtx') to read
    gex_data_type: one of 'h5ad', '10x_mtx', '10x_h5', 'loom'

    Returns the loaded object; if it is a view, a copy is returned instead.
    For an unrecognized gex_data_type a message is printed and SystemExit is
    raised with a non-zero status.
    '''
    print('reading:', gex_data, 'of type', gex_data_type)
    if gex_data_type == 'h5ad':
        adata = sc.read_h5ad( gex_data )

    elif gex_data_type == '10x_mtx':
        adata = sc.read_10x_mtx( gex_data )

    elif gex_data_type == '10x_h5':
        adata = sc.read_10x_h5( gex_data, gex_only=True )

    elif gex_data_type == 'loom':
        adata = sc.read_loom( gex_data )

    else:
        print('unrecognized gex_data_type:', gex_data_type, "should be one of ['h5ad', '10x_mtx', '10x_h5', 'loom']")
        # The `exit()` helper comes from the `site` module and is not always
        # available; raising SystemExit works everywhere and signals failure.
        raise SystemExit(1)

    # `is_view` replaces the deprecated `isview` attribute.
    if adata.is_view:
        adata = adata.copy()
    return adata
Esempio n. 18
0
def test_read_10x(tmp_path, mtx_path, h5_path, prefix):
    """The mtx directory and the h5 file must load to equal objects.

    When ``prefix`` is given, the mtx files are first copied into a temporary
    directory under prefixed names and read from there. Both objects must
    also be writable and compare equal after a round trip through h5ad.
    """
    if prefix is not None:
        # Build files named "prefix_XXX.xxx" in a temporary directory.
        original_dir = mtx_path
        mtx_path = tmp_path / "filtered_gene_bc_matrices_prefix"
        mtx_path.mkdir()
        plain_files = (entry for entry in original_dir.iterdir()
                       if entry.is_file())
        for entry in plain_files:
            shutil.copyfile(entry, mtx_path / f"{prefix}{entry.name}")

    from_mtx = sc.read_10x_mtx(mtx_path,
                               var_names="gene_symbols",
                               prefix=prefix)
    from_h5 = sc.read_10x_h5(h5_path)

    # Drop genome column for comparing v3
    is_v3 = "3.0.0" in str(h5_path)
    if is_v3:
        from_h5.var.drop(columns="genome", inplace=True)

    # Check equivalence
    assert_anndata_equal(from_mtx, from_h5)

    # Test that it can be written:
    mtx_out = tmp_path / "from_mtx.h5ad"
    h5_out = tmp_path / "from_h5.h5ad"
    from_mtx.write(mtx_out)
    from_h5.write(h5_out)

    roundtrip_mtx = sc.read_h5ad(mtx_out)
    roundtrip_h5 = sc.read_h5ad(h5_out)
    assert_anndata_equal(roundtrip_mtx, roundtrip_h5)
Esempio n. 19
0
    def _load_expression(self, clustering, tsne, diffexp):
        """Load the expression matrix and attach clustering, t-SNE and markers.

        Looks in ``self.args.indir`` for ``filtered_feature_bc_matrix.h5``
        first and ``filtered_gene_bc_matrices_h5.h5`` second.

        :param clustering: values stored as ``obs["cluster"]``
        :param tsne: object whose ``.values`` is stored as ``obsm["X_tsne"]``
        :param diffexp: table with ``p_adj``, ``log2_fc``, ``GeneID`` and
            ``Cluster`` columns
        :raises ScelVisException: if neither expression file exists
        :return: the annotated expression object
        """
        candidates = (
            os.path.join(self.args.indir, "filtered_feature_bc_matrix.h5"),
            os.path.join(self.args.indir, "filtered_gene_bc_matrices_h5.h5"),
        )
        expression_file = next(
            (path for path in candidates if os.path.isfile(path)), None)
        if expression_file is None:
            raise ScelVisException("cannot find expression file at %s" %
                                   self.args.indir)

        logger.info("Reading gene expression from %s", expression_file)
        with with_log_level(anndata.utils.logger, logging.WARN):
            adata = sc.read_10x_h5(expression_file)
        adata.var_names_make_unique()

        logger.info("Combining meta data")
        adata.obs["cluster"] = clustering
        adata.obs["n_counts"] = adata.X.sum(1).A1
        adata.obs["n_genes"] = (adata.X > 0).sum(1).A1

        logger.info("Adding coordinates")
        adata.obsm["X_tsne"] = tsne.values

        logger.info("Saving top %d markers per cluster", self.args.nmarkers)
        # Keep rows with p_adj < 0.05 and a positive log2 fold change.
        is_significant = diffexp["p_adj"] < 0.05
        is_upregulated = diffexp["log2_fc"] > 0
        ranked = diffexp[is_significant & is_upregulated].drop(
            "GeneID", axis=1).sort_values(["Cluster", "p_adj"])
        markers = ranked.groupby("Cluster").head(self.args.nmarkers)
        for col in markers.columns:
            adata.uns["marker_" + col] = markers[col].values

        return adata
Esempio n. 20
0
def main():
    """Convert an ``.h5ad`` or 10x ``.h5`` input into a tab-separated table.

    The table has features as rows and observations as columns. Row labels
    are translated through the map returned by
    ``utils.read_gtf_gene_symbol_to_id()``.

    Raises
    ------
    ValueError
        If the input extension is neither ``.h5ad`` nor ``.h5``, or if the
        matrix contains negative values.
    """
    parser = build_parser()
    args = parser.parse_args()

    _, input_ext = os.path.splitext(args.input)
    if input_ext == ".h5ad":
        x = ad.read_h5ad(args.input)
    elif input_ext == ".h5":
        x = sc.read_10x_h5(args.input)
    else:
        raise ValueError(f"Unrecognized file extension: {args.input}")
    logging.info("Read input: %s", x)

    logging.info("Reading gtf for gene name map")
    gene_name_map = utils.read_gtf_gene_symbol_to_id()

    # Transpose because BIRD wants features x obs
    x_df = pd.DataFrame(utils.ensure_arr(x.X),
                        index=x.obs_names,
                        columns=x.var_names).T
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not np.all(x_df.values >= 0.0):
        raise ValueError("Expression matrix contains negative values")
    x_df.index = [gene_name_map[g] for g in x_df.index]

    # Write output (tab-separated table)
    logging.info("Writing output to %s", args.output_table_txt)
    x_df.to_csv(args.output_table_txt, sep="\t")
Esempio n. 21
0
def Create_Scanpy_Anndata(storage_mount_point, sampleID, annotation_dict):
    '''
    Read one sample's 10x h5 file and annotate its cells with metadata.

    In:
    storage_mount_point: Data storage mount location; concatenated directly
        with the sample's file path, so it must supply any needed separator
    sampleID: ID number of the sample from the metadata table (ex: 2235-1)
    annotation_dict: Dictionary mapping each sample ID to a list whose first
        element is the file path and whose remaining elements are
        "name:value" metadata strings

    Out:
    New filled AnnData object

    Raises IndexError if a metadata string contains no ':'.
    '''
    sample_path, *metadata_list = annotation_dict[sampleID]
    newAdata = sc.read_10x_h5(storage_mount_point + sample_path)

    ## Set gene names to be unique since there seem to be duplicate names from Cellranger
    newAdata.var_names_make_unique()

    ## Add metadata for each sample to the observation (cells) annotations in the Anndata objects
    print('\nAdding Metadata to individual samples.\n')
    for field in metadata_list:
        # Split on the first ':' only, so values that themselves contain a
        # colon (times, URLs, ...) are kept whole instead of truncated.
        field_list = field.split(':', 1)
        meta_name = field_list[0]
        meta_value = field_list[1]
        newAdata.obs[meta_name] = meta_value
    return (newAdata)
Esempio n. 22
0
def load_ds(path):
    """
    Load one sample, filter it and annotate it from the ``SAMPLES`` table.

    H5 files are named like this GSM4698176_Sample_1_filtered_feature_bc_matrix.h5;
    the sample name is built from the second and third ``_``-separated parts.

    Fastcar outputs are in …fastcar/{sample}/matrix.mtx.gz; the sample name
    is the directory name.

    Raises ``ValueError`` if ``path`` is neither a directory containing
    ``matrix.mtx.gz`` nor a file ending in ``.h5``.
    """
    # normpath drops a trailing slash, which would otherwise make the
    # basename (and therefore the sample name) an empty string.
    fname = os.path.basename(os.path.normpath(path))
    if os.path.exists(os.path.join(path, "matrix.mtx.gz")):
        sample = fname
        ds = sc.read_10x_mtx(path)
    elif fname.endswith(".h5"):
        sample = "_".join(fname.split("_")[1:3])
        ds = sc.read_10x_h5(path)
    else:
        raise ValueError(f"Unknown input path {path}")
    ds.var_names = rename_genes(ds.var_names)
    ds.var_names_make_unique(join=".")
    ds.obs["orig.ident"] = sample
    # regex=True is required: pandas >= 2.0 treats the pattern literally by
    # default, which would leave the "-<digit>" barcode suffix in place.
    ds.obs_names = sample + "_" + ds.obs_names.str.replace(
        r"-\d$", "", regex=True)
    sc.pp.filter_cells(ds, min_genes=200)
    sc.pp.filter_genes(ds, min_cells=3)

    meta = SAMPLES.loc[SAMPLES.Sample == sample, :]
    ds.obs["Patient"] = meta.Patient.values[0]
    ds.obs["Day of intubation"] = meta["Day of intubation"].values[0]
    ds.obs["COVID-19"] = meta["COVID-19"].values[0]
    ds.obs["Sample"] = sample
    return ds
Esempio n. 23
0
def read_10x(input_10x_h5,
             input_10x_mtx,
             genome='hg19',
             var_names='gene_symbols',
             extra_obs=None,
             extra_var=None):
    """
    Wrapper function for sc.read_10x_h5() and sc.read_10x_mtx(), mainly to
    support adding extra metadata

    Parameters
    ----------
    input_10x_h5
        Path to a 10x HDF5 file, or ``None``. Takes precedence when both
        inputs are given.
    input_10x_mtx
        Path to a 10x mtx directory, or ``None``.
    genome
        Forwarded to ``sc.read_10x_h5`` (h5 input only).
    var_names
        Forwarded to ``sc.read_10x_mtx`` (mtx input only).
    extra_obs, extra_var
        Optional paths to tab-separated tables (first column as index) that
        are left-joined onto ``obs`` / ``var`` by index.

    Raises
    ------
    ValueError
        If neither ``input_10x_h5`` nor ``input_10x_mtx`` is given.
    """
    if input_10x_h5 is not None:
        adata = sc.read_10x_h5(input_10x_h5, genome=genome)
    elif input_10x_mtx is not None:
        adata = sc.read_10x_mtx(input_10x_mtx, var_names=var_names)
    else:
        # Without this branch `adata` stays unbound and the function fails
        # later with an UnboundLocalError that hides the real cause.
        raise ValueError(
            "Either `input_10x_h5` or `input_10x_mtx` must be provided")

    if extra_obs:
        obs_tbl = pd.read_csv(extra_obs, sep='\t', header=0, index_col=0)
        adata.obs = adata.obs.merge(
            obs_tbl,
            how='left',
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )

    if extra_var:
        var_tbl = pd.read_csv(extra_var, sep='\t', header=0, index_col=0)
        adata.var = adata.var.merge(
            var_tbl,
            how='left',
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )
    return adata
Esempio n. 24
0
def test_filter():
    """Run ``ddl.pp.filter_bcr`` on the test fixtures and write both results.

    Reads ``tests/sctest.h5`` and ``tests/test.h5``, sets ``filter_rna`` to
    False for every observation, filters, writes the two returned objects and
    prints the second one. Nothing is asserted; the test passes when no step
    raises.
    """
    rna_path = "tests/sctest.h5"
    contig_path = "tests/test.h5"

    rna = sc.read_10x_h5(rna_path)
    contigs = ddl.read_h5(contig_path)
    rna.obs["filter_rna"] = False

    contigs, rna = ddl.pp.filter_bcr(contigs, rna)

    rna.write("tests/sctest.h5ad", compression="gzip")
    # Overwrites the input fixture with the filtered result.
    contigs.write_h5(contig_path, compression="bzip2")
    print(contigs)
Esempio n. 25
0
def test_read_10x_v1():
    """The 1.2.0 mtx directory and h5 file must load to equal objects."""
    release_dir = os.path.join(ROOT, '1.2.0')
    mtx_dir = os.path.join(release_dir, 'filtered_gene_bc_matrices',
                           'hg19_chr21')
    h5_file = os.path.join(release_dir, 'filtered_gene_bc_matrices_h5.h5')

    from_mtx = sc.read_10x_mtx(mtx_dir, var_names='gene_symbols')
    from_h5 = sc.read_10x_h5(h5_file)
    assert_anndata_equal(from_mtx, from_h5)
Esempio n. 26
0
def test_read_10x_v3():
    """The 3.0.0 mtx directory and h5 file must load to equal objects.

    The ``genome`` column is dropped from the h5 result before comparing.
    """
    release_dir = ROOT / '3.0.0'
    from_mtx = sc.read_10x_mtx(release_dir / 'filtered_feature_bc_matrix',
                               var_names='gene_symbols')
    from_h5 = sc.read_10x_h5(release_dir / 'filtered_feature_bc_matrix.h5')
    from_h5.var.drop(columns="genome", inplace=True)
    assert_anndata_equal(from_mtx, from_h5)
Esempio n. 27
0
def from_10x_HDF5(filename, genome=None):
    """Load a 10x HDF5 file into a dense observations-by-variables table.

    Parameters
    ----------
    filename
        Path to the 10x ``.h5`` file.
    genome
        Forwarded to ``sc.read_10x_h5``; ``None`` leaves the choice to scanpy.

    Returns
    -------
    The result of ``_clean_up`` applied to a DataFrame indexed by observation
    names with one column per variable name.
    """
    # Keyword arguments: passing `genome` / `gex_only` positionally is
    # deprecated in recent scanpy releases.
    adata = sc.read_10x_h5(filename, genome=genome, gex_only=True)

    # toarray() gives a plain ndarray instead of the legacy np.matrix that
    # todense() returns.
    dataMatrix = pd.DataFrame(adata.X.toarray(),
                              columns=adata.var_names,
                              index=adata.obs_names)

    return _clean_up(dataMatrix)
Esempio n. 28
0
    def from_matrix_h5(cls, path_to_matrix):
        """
        Alternate constructor: build an instance from a 10x hdf5 matrix.

        The file is read with ``gex_only=True``, variable names are made
        unique, and the result is passed to ``cls``.
        """
        expression = scanpy.read_10x_h5(path_to_matrix, gex_only=True)
        expression.var_names_make_unique()
        instance = cls(expression)
        return instance
Esempio n. 29
0
def read_10x_h5(filename: PathLike,
                extended: bool = True,
                *args,
                **kwargs) -> MuData:
    """
    Read data from 10X Genomics-formatted HDF5 file

    This function uses scanpy.read_10x_h5() internally
    and patches its behaviour to:
    - attempt to read `interval` field for features;
    - attempt to locate peak annotation file and add peak annotation;
    - attempt to locate fragments file.

    Parameters
    ----------
    filename : str
            Path to 10X HDF5 file (.h5)
    extended : bool, optional (default: True)
            Perform extended functionality automatically such as
            locating peak annotation and fragments files.
    *args, **kwargs
            Forwarded to scanpy.read_10x_h5(). `gex_only` is always set to
            False here, so it must not be passed by the caller.

    Returns
    -------
    MuData built from the AnnData object that scanpy returned.
    """

    # Same call as before, with positionals placed ahead of the keyword
    # argument so the argument order is visible at a glance.
    adata = sc.read_10x_h5(filename, *args, gex_only=False, **kwargs)

    if extended:
        # 1) Read interval field from the HDF5 file.
        # The context manager closes the file even when the lookup raises
        # (e.g. a file without a "matrix/features" group).
        intervals = None
        with h5py.File(filename, "r") as h5file:
            features = h5file["matrix"]["features"]
            if "interval" in features:
                intervals = np.array(features["interval"]).astype(str)

        if intervals is not None:
            adata.var["interval"] = intervals
            print(f"Added `interval` annotation for features from {filename}")

    mdata = MuData(adata)

    # 2) Set up the default auxiliary files when an "atac" modality exists.
    if extended and "atac" in mdata.mod:
        initialise_default_files(mdata, filename)

    return mdata
Esempio n. 30
0
def basic_analysis(filename):
    """Run a standard scanpy pipeline on a 10x h5 file and save the outputs.

    Steps, in order: read, ``recipe_zheng17`` preprocessing, neighbors,
    louvain clustering, UMAP, and gene ranking per louvain group. The result
    is written to ``./write/result.h5ad`` and two plots are saved.
    """
    cluster_key = "louvain"

    adata = sc.read_10x_h5(filename)

    # preprocessing and neighbors
    sc.pp.recipe_zheng17(adata)
    sc.pp.neighbors(adata)

    # clustering, embedding and gene ranking
    sc.tl.louvain(adata)
    sc.tl.umap(adata)
    sc.tl.rank_genes_groups(adata, cluster_key)

    adata.write("./write/result.h5ad")

    # plotting
    sc.pl.umap(adata, color=cluster_key, save=".png")
    sc.pl.rank_genes_groups(adata, save=".pdf")