Example No. 1
def load_or_train_scvi_model(model_name=model_name, anndata_path=anndata_path):
    # Try loading model, if it doesn't exist train from scratch
    print('Trying to load or train model...')
    try:
        model = scvi.model.SCVI.load(model_name)
        print('Loaded model:', model_name)
    except Exception:
        ### DEFINE AND TRAIN MODEL
        # these hyperparameters are fine for a small dataset, with a few batches
        # if integration is a problem then you can try increasing the layers to 3
        # and hidden units to 256

        print('Creating and training model:', model_name)

        adata = anndata.read_h5ad(anndata_path)
        print(adata)
        print('Restricting to genes with minimum counts of ', min_gene_counts)
        adata.var['gene_counts'] = np.squeeze(np.asarray(adata.X.sum(0)))
        adata = adata[:, adata.var.gene_counts > min_gene_counts]
        print(adata)

        ## register adata with SCVI, for more information see
        ## https://docs.scvi-tools.org/en/stable/api/reference/scvi.data.setup_anndata.html
        adata.layers["counts"] = adata.X.copy().tocsr()  # converts to CSR format, preserve counts
        scvi.data.setup_anndata(adata,
                                layer="counts",
                                batch_key=batch_key)

        # typically you don't need to go tweak these parameters for training a model
        model = scvi.model.SCVI(adata,
                                n_hidden=256,
                                n_layers=2,
                                gene_likelihood='nb',
                                dispersion='gene-batch'
                                )

        # MODEL TRAINING
        # this model will train quickly even without a GPU, 25 epochs is not quite enough to
        # finish training, but this notebook is meant to run quickly just for showing the entire
        # data generation pipeline

        model.train(check_val_every_n_epoch=1,
                    use_gpu=True,
                    max_epochs=125,
                    plan_kwargs={'lr': 1e-3})

        train_test_results = model.history['elbo_train']
        train_test_results['elbo_validation'] = model.history['elbo_validation']

        ### MAKE SURE THE MODEL FINISHED TRAINING FOR BEST RESULTS
        print(train_test_results)
        model.save(model_name, save_anndata=True)
        # save the training results to a csv for inspection if needed
        train_test_results.to_csv(model_name + '+train_test_results.csv')
    return model
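The loader above reads several module-level names when it has to train from scratch. A minimal usage sketch, with hypothetical values for those names:

# hypothetical notebook-level configuration assumed by load_or_train_scvi_model
batch_key = 'batch'        # obs column identifying each cell's batch
min_gene_counts = 10       # genes with fewer total counts are dropped before training

model = load_or_train_scvi_model(model_name='my_scvi_model', anndata_path='raw_counts.h5ad')
latent = model.get_latent_representation()  # per-cell latent embedding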
Example No. 2
    def populate(self):
        ad = anndata.read_h5ad(
            os.path.join(self.save_path, self.filenames[0])
        )  # obs = cells, var = genes

        # extract GeneExpressionDataset relevant attributes
        # and provide access to annotations from the underlying AnnData object.
        (
            X,
            batch_indices,
            labels,
            gene_names,
            cell_types,
            obs,
            obsm,
            var,
            _,
            uns,
        ) = extract_data_from_anndata(
            ad,
            batch_label=self.batch_label,
            ctype_label=self.ctype_label,
            class_label=self.class_label,
            use_raw=self.use_raw,
        )
        # Dataset API takes a dict as input
        obs = obs.to_dict(orient="list")
        var = var.to_dict(orient="list")

        # add external cell measurements
        Ys = []
        if self.cell_measurements_col_mappings_temp is not None:
            for name, attr_name in self.cell_measurements_col_mappings_temp.items():
                columns = uns[attr_name]
                measurement = CellMeasurement(
                    name=name,
                    data=obsm[name],
                    columns_attr_name=attr_name,
                    columns=columns,
                )
                Ys.append(measurement)

        self.populate_from_data(
            X=X,
            Ys=Ys,
            labels=labels,
            batch_indices=batch_indices,
            gene_names=gene_names,
            cell_types=cell_types,
            cell_attributes_dict=obs,
            gene_attributes_dict=var,
        )
        self.filter_cells_by_count()

        del self.cell_measurements_col_mappings_temp
Example No. 3
def read_data(filename,seed=1,nsample=3000000,dlevel='cell_ontology_class_reannotated',exclude_tissues=['marrow'], return_genes=False, DATA_DIR = '../../OnClass_data/'):
	name2co = get_ontology_name(DATA_DIR = DATA_DIR)[1]
	np.random.seed(seed)
	if 'facs' in filename:
		tech = 'facs'
	elif 'droplet' in filename:
		tech = 'droplet'
	else:
		tech = ''
	if not os.path.isfile(filename):
		sys.exit('%s does not exist' % filename)
	x = read_h5ad(filename)
	ncell = np.shape(x.X)[0]
	dataset = x.X
	months = np.array(x.obs['age'].tolist())
	labels = np.array(x.obs[dlevel].tolist())
	tissues = np.array(x.obs['tissue'].tolist())

	ind = []
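	# keep only cells from non-excluded tissues whose label maps to a cell-ontology term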
	for i in range(ncell):
		tis = tissues[i]
		lab = labels[i]
		if tis.lower() in exclude_tissues or lab.lower() not in name2co:
			#print ('%s %s' % (tis, lab))
			continue
		ind.append(i)
	ind = np.array(ind)
	dataset = dataset[ind,:]
	months = months[ind]
	labels = labels[ind]
	tissues = tissues[ind]
	annot = [name2co[y.lower()] for y in labels]
	annot = np.array(annot)

	datanames = []
	genes_list = {}
	labels = {}
	datasets = {}
	types = {}
	month_labels = {}
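	# split the filtered cells by age (month): one dataset per '<tech><month>' key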
	uniq_age = np.unique(months)
	for m in uniq_age:
		dataname = tech+m
		datanames.append(dataname)
		index = np.array(months == m)
		datasets[dataname] = dataset[index,:]
		genes_list[dataname] = x.var.index
		labels[dataname] = annot[index]
		month_labels[dataname] = np.full(len(annot), len(index))
		types[dataname] = Counter(np.array(annot)[index])
	all_X, all_Y = extract_data(datanames, datasets, labels)
	if return_genes:
		return all_X, all_Y, genes_list
	else:
		return all_X, all_Y
Example No. 4
def read_h5ad_st(cnt_pth: List[str]) -> pd.DataFrame:
    """read spatial data from h5ad

    Parameters:
    ----------
    cnt_pth : List[str]
        paths to spatial data (h5ad) files

    Returns:
    -------
    Pandas DataFrame of the joint matrix. Data points
    from the same file share the same rowname prefix k;
    in the joint matrix the rownames are given as
    "k&-[x-coordinate]x[y-coordinate]" if coordinates
    are identified, and otherwise as
    "k&-original-rowname"

    """

    _cnts = list()
    for k, p in enumerate(cnt_pth):
        _data = ad.read_h5ad(p)

        if "x" in _data.obs.keys():
            new_idx = [str(k) + "&-" + str(x)+"x"+str(y) for\
                        x,y in zip(_data.obs["x"].values,
                                    _data.obs['y'].values,
                                    )]

        elif "spatial" in _data.obsm.keys():

            new_idx = [str(k) + "&-" + str(x)+"x"+str(y) for\
                        x,y in _data.obsm["spatial"]]
        else:
            new_idx = [str(k) + "&-" + str( x ) for\
                        x in _data.obs_names ]

        new_idx = pd.Index(new_idx)
        _data = pd.DataFrame(
            grab_anndata_counts(_data),
            index=new_idx,
            columns=_data.var_names,
        )
        _cnts.append(_data)

    cnts = pd.concat(_cnts, join="outer")

    del _cnts, _data

    cnts[pd.isna(cnts)] = 0.0
    cnts = cnts.astype(float)

    return cnts
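A hypothetical call (placeholder file paths) illustrating the rowname scheme described in the docstring:

# merge two spatial sections into one joint count matrix
joint_counts = read_h5ad_st(["section_0.h5ad", "section_1.h5ad"])
# rownames look like "0&-12.0x34.0"; the numeric prefix identifies the source file
print(joint_counts.index[:5])
print(joint_counts.shape)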
Example No. 5
    def preprocess(self):
        print("Preprocessing dataset")

        ad = anndata.read_h5ad(self.save_path + self.download_name)  # obs = cells, var = genes
        gene_names = np.array(ad.var.index.values, dtype=str)
        data = ad.X.toarray()
        select = data.sum(axis=1) > 0  # remove cells that don't express any gene
        data = data[select, :]

        print("Finished preprocessing dataset")
        return data, gene_names
Example No. 6
    def test_anndata(self):
        adata = read_h5ad(join('data', 'test.h5ad'))
        w = AnnDataWrapper(adata, cell_set_obs=['CellType'], mappings_obsm=['X_umap'], mappings_obsm_names=['UMAP'])

        cells_creator = w.make_cells_file_def_creator('A', 0)
        cells = cells_creator( 'http://localhost:8000')
        self.assertEqual(cells, {'type': 'cells', 'fileType': 'anndata-cells.zarr', 'url': 'http://localhost:8000/A/0/anndata.zarr', 'options': { "mappings": { 'UMAP': { 'dims': [0, 1], 'key': 'obsm/X_umap' } } } })

        cell_sets_creator = w.make_cell_sets_file_def_creator('A', 0)
        cell_sets = cell_sets_creator( 'http://localhost:8000')
        self.assertEqual(cell_sets, {'type': 'cell-sets', 'fileType': 'anndata-cell-sets.zarr', 'url': 'http://localhost:8000/A/0/anndata.zarr', 'options': [{'groupName': 'CellType', 'setName': 'obs/CellType'}]})
Example No. 7
def plotGene(exFn, gene):
    prRes = anndata.read_h5ad(exFn)
    if gene in prRes.var.index:
        sc.pl.umap(prRes, color=[gene])
    else:
        print(
            "Error! Please check your input gene ID; it must match the gene IDs in your expression file"
        )
        print(
            "The gene may also have been removed by the dispersion-based gene filtering in the prerun program"
        )
Example No. 8
def read_adata(adata_pth):
    adata = ad.read_h5ad(adata_pth)
    if hasattr(adata,"obsm") and\
        "spatial" in adata.obsm.keys():
        crd = adata.obsm["spatial"]
    elif "x" in adata.obs.keys() and\
         "y" in adata.obs.keys():
        crd = adata.obs[["x","y"]].values
    else:
        raise ValueError("No spatial coordinates found")
    return crd
Example No. 9
def read_annotations(data):
    if len(data) == 1:
        ext = os.path.splitext(data[0])[-1]
        if ext == '.h5ad':
            res = anndata.read_h5ad(
                data[0])[:, :0].copy()  # slice away all genes, keeping only the cell annotations
            return res
        else:
            return None
    elif len(data) == 3:
        return None
Example No. 10
def load_palantir_data(smoothed=False):
    fn = '../../data/external/Palantir/human_cd34_bm_rep1.h5ad'
    an = anndata.read_h5ad(fn)

    genes = an.var_names
    cells = an.obs_names

    if not smoothed:
        counts = singlet.CountsTable(
            data=an.raw.X.todense().T,
            index=genes,
            columns=cells,
        )
    else:
        counts = singlet.CountsTable(
            data=an.obsm['MAGIC_imputed_data'].T,
            index=genes,
            columns=cells,
        )

    ss = singlet.SampleSheet(an.obs)
    ss['tsne_1'] = an.obsm['tsne'][:, 0]
    ss['tsne_2'] = an.obsm['tsne'][:, 1]
    ss['clusters'] = ss['clusters'].astype(str)

    ds = singlet.Dataset(
        counts_table=counts,
        samplesheet=ss,
    )

    ds.samplesheet['Cell Subtype'] = ds.samplesheet['clusters'].replace({
        '0': 'HSC',
        '1': 'HSC',
        '2': 'Ery-precursor',
        '3': 'Mono',
        '4': 'Mono-precursor',
        '5': 'CLP',
        '6': 'Mono',
        '7': 'pDC',
        '8': 'Ery',
        '9': 'Mega',
    })

    return ds
Example No. 11
    def extract_anndata_elements_from_file(self):
        logging.info(
            f"Reading in AnnData dataset: {path.basename(self.input_filename)}"
        )
        self.anndata = anndata.read_h5ad(self.input_filename,
                                         backed="r" if self.backed else None)
        logging.info("Completed reading in AnnData dataset!")

        self.obs = self.transform_dataframe_index_into_column(
            self.anndata.obs, "obs", self.obs_index_column_name)
        self.var = self.transform_dataframe_index_into_column(
            self.anndata.var, "var", self.vars_index_column_name)
Example No. 12
def test_read(tmpdir):
    from read_partial_registry import read
    pth = tmpdir / "test.h5ad"

    orig = gen_adata((10, 20))
    orig.write(pth, compression="lzf")

    truth = ad.read_h5ad(pth)
    test = read(pth)

    assert_equal(truth, test)
    assert_equal(orig, test)
Example No. 13
def load(tempdir,
         task,
         dataset,
         test=None,
         method=None,
         dependency="a related test"):
    """Load a cached h5ad file."""
    data_path = _cache_name(tempdir, task, dataset, test=test, method=method)
    assert os.path.isfile(
        data_path), "Intermediate file missing. Did {} fail?".format(
            dependency)
    return anndata.read_h5ad(data_path)
Example No. 14
def qc_checks(args):
    adata = anndata.read_h5ad(args.filtered_feature_matrix)
    adata.var_names_make_unique()

    qc_by_cell, qc_by_gene = sc.pp.calculate_qc_metrics(adata)

    # current directory is set up by the CWL runner
    qc_path = Path('qc_results.hdf5').absolute()
    print('Saving QC results to', qc_path)
    with pd.HDFStore(qc_path) as store:
        store['qc_by_cell'] = qc_by_cell
        store['qc_by_gene'] = qc_by_gene
Example No. 15
 def _set_file_parameters(self):
     import anndata
     try:
         adata = anndata.read_h5ad(self._file_name)
         self.n_rows, self.n_cols = adata.shape
         all_el = self.n_rows * self.n_cols
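         # sparsity = fraction of zero entries in X (sparse and dense X handled separately)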
         if sp.issparse(adata.X):
             self.sparsity = (all_el - adata.X.tocsr().count_nonzero()) / all_el
         else:
             self.sparsity = (all_el - np.count_nonzero(adata.X)) / all_el
     except OSError:
         pass
Example No. 16
def viral_enrichment_significance(data, clusters_col, save_str, perm_no):
    perm_file = save_str+"_viral_perm.h5ad"
    fdr_file = save_str + "_FDRs.csv"

    if not os.path.exists(perm_file):
        print("The file "+perm_file+" doesn't exist.")
        print("(A) Shuffling viral data...")
        # A. Shuffle the viral UMI counts across cells 100 times and compute the enrichment per cluster for each permutation
        cell_no = data.shape[0]
        data_perm = data.copy()
        data_perm.obs.reset_index(inplace=True)
        for i in range(0,perm_no):
            rand_indx = random.sample(range(0,cell_no), cell_no)
            data_perm.obs["ViralPerm_" + str(i)] = data_perm.obs.loc[rand_indx,"Viral+"].reset_index(drop=True)
            viral_enrichment_score(data_perm, clusters_col, viral_count_col="ViralPerm_" + str(i), suffix="_"+str(i))

        data_perm.write_h5ad(perm_file)
    else:
        if not os.path.exists(fdr_file):
            print("(A) Reading existing viral-shuffeling file: "+perm_file)
            data_perm = an.read_h5ad(perm_file)

    # B. Distribution -> p-val
    if not os.path.exists(fdr_file):
        print("(B) Computing FDR from distribution of enrichment scores...")
        # Collecting enrichment score per permutation
        curr_cols = [clusters_col]
        col_name = "ViralEnrichment"
        curr_cols.extend([col for col in data_perm.obs.columns if re.search(col_name+"_", col)])
        df_for_summary = data_perm.obs[curr_cols]
        enrich_df = pd.DataFrame(index=data_perm.obs[clusters_col].drop_duplicates().sort_values())
        for i in range(0,perm_no):
            x = df_for_summary[["Enrichment_"+str(i), clusters_col]].drop_duplicates()
            enrich_df = enrich_df.merge(x, left_index=True, right_on=clusters_col).set_index(clusters_col)
        pd.DataFrame.transpose(enrich_df).hist(bins=15)

        # comparing real enrichments per cluster to permutations
        real_enrichments = data.obs[[clusters_col, "Enrichment_"]].drop_duplicates().set_index(clusters_col)
        FDR= {}
        for clust in set(data.obs[clusters_col]):
            FDR[clust] = (sum(float(real_enrichments.loc[clust]) <= enrich_df.loc[clust,:])+1) / (perm_no+1)
        FDR_df = pd.DataFrame.from_dict(FDR, orient="index").reset_index().rename(columns={"index":"clusters",0:"FDR"})
        FDR_df.to_csv(save_str+"_FDRs.csv")
    else:
        print("(B) Reading existing FDR file: " + fdr_file)
        FDR_df = pd.read_csv(fdr_file, header=0, index_col=[0])

    adj_FDR = fdrcorrection(FDR_df["FDR"])
    FDR_df["adj_FDR"] = adj_FDR[1]
    FDR_df.to_csv(save_str + "_FDRs.csv")

    return FDR_df
Example No. 17
def load_normalized_data(file_path,
                         data_name='facs',
                         flag_size_factor=True,
                         total_ct_per_cell=1e4,
                         flag_log1p=True):
    """load normalized data
    1. Load filtered data for both FACS and droplet
    2. Size factor normalization to counts per 1 million (total_ct_per_cell)
    3. log(x+1) transform
    4. Combine the data 

    Args:
        file_path (str): file path. Should contain both FACS data facs_filtered.h5ad and droplet data droplet_filtered.h5ad

    Returns:
        adata_combine (AnnData): Combined data for FACS and droplet
    """

    if data_name == 'facs':
        file_name = 'tabula-muris-senis-facs-official-raw-obj.h5ad'
    elif data_name == 'droplet':
        file_name = 'tabula-muris-senis-droplet-official-raw-obj.h5ad'
    elif data_name == 'facs_old':
        file_name = 'facs_filtered.h5ad'
    elif data_name == 'droplet_old':
        file_name = 'droplet_filtered.h5ad'
    else:
        return None

    # Load filtered data
    adata = read_h5ad(f'{file_path}/{file_name}')

    # Update annotations
    adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)
    adata.obs['n_counts'] = (adata.X).sum(axis=1)
    adata.obs['age_num'] = [int(x.replace('m', '')) for x in adata.obs['age']]

    # Size factor normalization
    if flag_size_factor:
        sc.pp.normalize_per_cell(adata,
                                 counts_per_cell_after=total_ct_per_cell)

    # log(x+1) transform
    if flag_log1p:
        sc.pp.log1p(adata)

    # Filter data
    if 'facs' in data_name:
        ind_select = adata.obs['age'].isin(['3m', '18m', '24m'])
        adata = adata[ind_select, ]

    return adata
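A hypothetical call, assuming the Tabula Muris Senis h5ad files are stored under file_path:

# load, size-factor normalize, and log-transform each modality separately
adata_facs = load_normalized_data('/path/to/tabula-muris-senis', data_name='facs')
adata_droplet = load_normalized_data('/path/to/tabula-muris-senis', data_name='droplet')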
Example No. 18
def test_backed_anndata_scvi(save_path):
    adata = scvi.data.synthetic_iid()
    path = os.path.join(save_path, "test_data.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    setup_anndata(adata, batch_key="batch")

    model = SCVI(adata, n_latent=5)
    model.train(1, train_size=0.5)
    assert model.is_trained is True
    z = model.get_latent_representation()
    assert z.shape == (adata.shape[0], 5)
    model.get_elbo()
Example No. 19
def test_to_memory_full(tmp_path, array_type):
    backed_pth = tmp_path / "backed.h5ad"
    mem_adata = gen_adata((15, 10), X_type=array_type)
    mem_adata.raw = gen_adata((15, 12), X_type=array_type)
    mem_adata.write_h5ad(backed_pth, compression="lzf")

    backed_adata = ad.read_h5ad(backed_pth, backed="r")
    assert_equal(mem_adata, backed_adata.to_memory())

    # Test that raw can be removed
    del backed_adata.raw
    del mem_adata.raw
    assert_equal(mem_adata, backed_adata.to_memory())
Example No. 20
 def convert_pbmc3k(self, **kwargs):
     random_string = "".join(
         random.choice(string.ascii_letters) for _ in range(8))
     data_locator = f"/tmp/test_{random_string}.cxg"
     self.fixtures.append(data_locator)
     source_h5ad = anndata.read_h5ad(
         f"{PROJECT_ROOT}/example-dataset/pbmc3k.h5ad")
     write_cxg(adata=source_h5ad,
               container=data_locator,
               title="pbmc3k",
               **kwargs)
     config = app_config(data_locator)
     return CxgAdaptor(DataLocator(data_locator), config)
Example No. 21
def read_h5ad(path):
    try:
        adata = anndata.read_h5ad(path)
    except Exception:
        return "file_error"

    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)
    name_genes(adata, inplace=True)
    return adata
Example No. 22
def test_backed_anndata(save_path):
    adata = scvi.data.synthetic_iid()
    path = os.path.join(save_path, "test_data.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    adata_manager = generic_setup_adata_manager(adata, batch_key="batch")

    # test get item
    bd = AnnTorchDataset(adata_manager)
    bd[np.arange(adata.n_obs)]

    # sparse
    adata = scvi.data.synthetic_iid()
    adata.X = csr_matrix(adata.X)
    path = os.path.join(save_path, "test_data2.h5ad")
    adata.write_h5ad(path)
    adata = anndata.read_h5ad(path, backed="r+")
    adata_manager = generic_setup_adata_manager(adata, batch_key="batch")

    # test get item
    bd = AnnTorchDataset(adata_manager)
    bd[np.arange(adata.n_obs)]
Example No. 23
def annotate_file(filtered_file: Path, unfiltered_file: Path,
                  token: str) -> Tuple[anndata.AnnData, anndata.AnnData]:
    # Get the directory
    data_set_dir = fspath(unfiltered_file.parent.stem)
    # And the tissue type
    tissue_type = get_tissue_type(data_set_dir, token)

    filtered_adata = anndata.read_h5ad(filtered_file)
    unfiltered_adata = anndata.read_h5ad(unfiltered_file)

    filtered_adata.obs['barcode'] = filtered_adata.obs.index
    filtered_adata.obs['dataset'] = data_set_dir
    filtered_adata.obs['organ'] = tissue_type
    filtered_adata.obs['modality'] = 'rna'

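    # restrict the unfiltered matrix to the filtered barcodes, then carry over the filtered annotations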
    cells = list(filtered_adata.obs.index)
    unfiltered_subset = unfiltered_adata[cells, :].copy()
    unfiltered_subset.obs = filtered_adata.obs
    unfiltered_subset.obsm = filtered_adata.obsm
    print(unfiltered_subset.obsm.keys())

    return unfiltered_subset.copy(), filtered_adata
Example No. 24
    def get_train_test_data(self, data_file, split):
        file_object = anndata.read_h5ad(data_file)
        matrix = file_object.X
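        # note: the split below treats the columns of X as cells (i.e. a genes x cells matrix)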

        num_cells_in_training = int(matrix.shape[1] * split)

        transpose = matrix.transpose()

        training_data = transpose[0:num_cells_in_training][:].transpose()
        test_data = transpose[num_cells_in_training:][:].transpose()

        print(f"Training data size : {training_data.shape}. Test data size : {test_data.shape}.")
        return training_data, test_data
Example No. 25
def run(args):
    """UMAP
    
    """
    # Parse options...
    options = args
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    # take options
    h5ad_fname = options.afile
    out_fname = options.ofile

    data_source = options.data_source
    n_neighbors = options.n_neighbors
    min_dist = options.min_dist

    if data_source != "X":
        if data_source == "lsi":
            data_source = "lsa"
        data_source = "X_" + data_source

    random_state = options.random_state

    # read h5ad
    adata = read_h5ad(h5ad_fname)

    # UMAP
    m = umap.UMAP(metric="euclidean",
                  init="spectral",
                  random_state=random_state,
                  n_neighbors=n_neighbors,
                  min_dist=min_dist,
                  n_components=2).fit(adata.obsm[data_source])
    adata.obsm['X_umap'] = m.embedding_
    adata.uns['umap'] = {
        'params': {
            'metric': 'euclidean',
            'init': 'spectral',
            'random_state': random_state,
            'n_neighbors': n_neighbors,
            'min_dist': min_dist
        }
    }

    # write h5ad
    adata.write_h5ad(filename=out_fname)
    return
Example No. 26
def _transform_single_h5ad(
    adata_path,
    output_path,
    chrom_size_path,
    bin_size,
    step_size,
    window_size,
    compression,
):
    """
    Resize non-overlap chrom bin count adata
    """
    if (step_size % bin_size != 0) or (window_size % bin_size != 0):
        raise ValueError(
            "step_size and window_size need to be integral multiple of bin_size"
        )
    n = step_size // bin_size
    m = window_size // bin_size
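    # n = stride and m = window width, both expressed in numbers of original bins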

    adata = anndata.read_h5ad(adata_path)

    # somehow, I need to copy this out otherwise its super slow
    chrom_idx = adata.var["chrom"].values.copy()
    csc_data = adata.X.tocsc()
    chrom_dict = parse_chrom_size(chrom_size_path)

    chrom_data_list = []
    for chrom in chrom_dict.keys():
        chrom_csc_data = csc_data[:, chrom_idx == chrom]
        chunk_generator = (
            ss.csc_matrix(chrom_csc_data[:, i : i + m].sum(axis=1))
            for i in range(0, chrom_csc_data.shape[1], n)
        )
        chrom_data = ss.hstack(list(chunk_generator))
        chrom_data_list.append(chrom_data)
    total_data = ss.hstack(chrom_data_list)

    # TODO add all necessary info in adata.uns
    adata = anndata.AnnData(
        X=total_data,
        obs=adata.obs,
        var=generate_chrom_bin_bed_dataframe(
            chrom_size_path, window_size=window_size, step_size=step_size
        ),
        uns=dict(
            bin_size=window_size, step_size=step_size, chrom_size_path=chrom_size_path
        ),
    )

    adata.write(filename=output_path, compression=compression)
    return output_path
Example No. 27
def run(args):
    """Cluster
    
    """
    # Parse options...
    options = args
    # end of parsing commandline options
    info = options.info
    warn = options.warn
    debug = options.debug
    error = options.error

    # take options
    h5ad_fname = options.afile
    out_fname = options.ofile

    method = options.method
    data_source = options.data_source
    n_neighbors = options.n_neighbors

    if data_source != "X":
        if data_source == "lsi":
            data_source = "lsa"
        data_source = "X_" + data_source

    random_state = options.random_state
    dbscan_min_samples = options.min_samples
    dbscan_min_cluster_size = options.min_cluster_size

    # read h5ad
    adata = read_h5ad(h5ad_fname)

    # clustering
    if method == "louvain":
        adata = louvain_clustering(adata, data_source, n_neighbors=n_neighbors)
    elif method == "leiden":
        adata = leiden_clustering(adata, data_source, n_neighbors=n_neighbors)
    elif method == "dbscan":
        adata = dbscan_clustering(
            adata,
            data_source,
            dbscan_min_samples=dbscan_min_samples,
            dbscan_min_cluster_size=dbscan_min_cluster_size)
    elif method == "spectral":
        adata = spectral_clustering(adata,
                                    data_source,
                                    n_neighbors=n_neighbors)

    # write h5ad
    adata.write_h5ad(filename=out_fname)
    return
Example No. 28
def main(input_dir, output_dir):
    output_dir.mkdir(exist_ok=True)
    for h5ad_file in ["secondary_analysis.h5ad", "scvelo_annotated.h5ad"]:
        adata = read_h5ad(input_dir / h5ad_file)
        if "rank_genes_groups" in adata.uns:
            # Handle marker genes by putting top n per cluster in `obs` for `factors` visualization.
            marker_genes = []
            for i in range(NUM_MARKER_GENES_TO_VISUALIZE):
                adata.obs[f"marker_gene_{str(i)}"] = [
                    "" for v in adata.obs.index
                ]
                for cluster in adata.obs["leiden"]:
                    marker_gene = adata.uns["rank_genes_groups"]["names"][i][
                        cluster]
                    adata.obs[f"marker_gene_{str(i)}"][adata.obs["leiden"] ==
                                                       cluster] = marker_gene
                    marker_genes.append(marker_gene)
            adata.var["marker_genes_for_heatmap"] = [
                gene in marker_genes for gene in adata.var.index
            ]
        if "dispersions_norm" in adata.var:
            top_dispersion = adata.var["dispersions_norm"][sorted(
                range(len(adata.var["dispersions_norm"])),
                key=lambda k: adata.var["dispersions_norm"][k],
            )[-len(adata.obs['leiden'].unique()) *
              NUM_MARKER_GENES_TO_VISUALIZE:][0]]
            adata.var["top_highly_variable"] = (adata.var["dispersions_norm"] >
                                                top_dispersion)
        for layer in adata.layers:
            if isinstance(adata.layers[layer], sparse.spmatrix):
                adata.layers[layer] = adata.layers[layer].tocsc()

        # All data from secondary_analysis is scaled at the moment to zero-mean unit-variance
        # https://github.com/hubmapconsortium/salmon-rnaseq/blob/master/bin/analysis/scanpy_entry_point.py#L47
        # We currently cannot visualize this in Vitessce so we replace `X` with the log-normalized raw counts:
        # https://github.com/hubmapconsortium/salmon-rnaseq/commit/9cf1dd4dbe4538b565a0355f56399d3587827eff
        # Ideally, we should be able to manage the `layers` and `X` simultaneously in `zarr` but currently we cannot:
        # https://github.com/theislab/anndata/issues/524
        if (SECONDARY_ANALYSIS == h5ad_file):
            adata.layers['scaled'] = adata.X.copy()
            adata.X = adata.layers['unscaled'].copy()
        zarr_path = output_dir / (Path(h5ad_file).stem + ".zarr")

        # If the matrix is sparse, it's best for performance to
        # use non-sparse formats to keep the portal responsive.
        # In the future, we should be able to use CSC sparse data natively
        # and get equal performance:
        # https://github.com/theislab/anndata/issues/524
        if isinstance(adata.X, sparse.spmatrix):
            adata.X = adata.X.todense()
        adata.write_zarr(zarr_path, [adata.shape[0], VAR_CHUNK_SIZE])
Example No. 29
    def __init__(self,
                 data_path,
                 num_gene,
                 shared_gene_mask=None,
                 filter_mask=None,
                 normalized=True):
        self.data_path = data_path
        self.num_gene = num_gene
        self.gene_mask = shared_gene_mask
        self.filter_mask = filter_mask
        self.normalized = normalized
        self.anndata = anndata.read_h5ad(data_path)
        self.num_class = len(self.anndata.obs['labels'].unique().tolist())

        if self.filter_mask is None:
            self.filter_mask, _ = sc.pp.filter_genes(self.anndata,
                                                     min_counts=1,
                                                     inplace=False)
        else:
            print("use share filter")
        self.anndata = self.anndata[:, self.filter_mask]

        if self.gene_mask is None:
            self.gene_mask = self.select_gene(data=self.anndata.X,
                                              num_gene=self.num_gene)
        else:
            print("use share gene")
        if self.normalized:
            anndata_norm = self.anndata.copy()
            sc.pp.normalize_per_cell(anndata_norm,
                                     counts_per_cell_after=1_000_000)
            sc.pp.log1p(anndata_norm)
            anndata_norm.X = anndata_norm.X.toarray()
            anndata_norm.X -= anndata_norm.X.mean(axis=0)
            anndata_norm.X /= anndata_norm.X.std(axis=0)
            if np.isnan(anndata_norm.X).any():
                print("Detect nan, fix by nan to num")
                anndata_norm.X = np.nan_to_num(anndata_norm.X)
                assert (not np.isnan(anndata_norm.X).any())

            self.anndata_preprocessed = anndata_norm[:, self.gene_mask].copy()
        else:
            self.anndata_preprocessed = self.anndata[:, self.gene_mask].copy()
        self.X = torch.tensor(self.anndata_preprocessed.X)
        self.id_to_batch, self.batch_to_id = self.get_batch_map()
        self.id_to_cell, self.cell_to_id = self.get_cell_map()

        self.cell_label_tensor = torch.tensor(
            [self.cell_to_id[e] for e in self.anndata.obs['labels']])
        self.batch_id_tensor = torch.tensor(
            [self.batch_to_id[e] for e in self.anndata.obs['batch_id']])
Example No. 30
    def test_load_to_h5ad(self):

        with tempfile.TemporaryDirectory() as tmpdir:

            dname = os.path.join(tmpdir, "dense.h5ad")
            sname = os.path.join(tmpdir, "sparse.h5ad")

            self.workflow.output_dir = tmpdir
            self.workflow.load_data_and_save_h5ad("dense.h5ad")

            data = ad.read_h5ad(dname)
            npt.assert_array_almost_equal_nulp(data.X,
                                               self.workflow.data.values)
            os.remove(dname)

            self.workflow.load_data_and_save_h5ad("sparse.h5ad",
                                                  to_sparse=True)

            data = ad.read_h5ad(sname)
            self.assertTrue(sps.isspmatrix_csr(data.X))
            npt.assert_array_almost_equal_nulp(data.X.A,
                                               self.workflow.data.values.A)
            os.remove(sname)