def load_palantir_data(smoothed=False):
    """Build a singlet Dataset from the Palantir CD34+ bone marrow h5ad file.

    Parameters:
        smoothed: when False (default) the counts are taken from
            ``an.raw.X`` (converted to dense); when True they are taken
            from the ``'MAGIC_imputed_data'`` entry of ``obsm``.

    Returns:
        A ``singlet.Dataset`` whose samplesheet is built from ``obs``, with
        extra ``tsne_1``/``tsne_2`` columns and ``clusters`` cast to str.
    """
    adata = anndata.read_h5ad(
        '../../data/external/Palantir/human_cd34_bm_rep1.h5ad',
    )
    gene_names = adata.var_names
    cell_names = adata.obs_names

    # Either source is transposed so that rows are labelled with the
    # variable names and columns with the observation names.
    if smoothed:
        matrix = adata.obsm['MAGIC_imputed_data'].T
    else:
        matrix = adata.raw.X.todense().T
    counts = singlet.CountsTable(
        data=matrix,
        index=gene_names,
        columns=cell_names,
    )

    # Sample metadata: obs table plus the stored t-SNE coordinates
    sheet = singlet.SampleSheet(adata.obs)
    sheet['tsne_1'] = adata.obsm['tsne'][:, 0]
    sheet['tsne_2'] = adata.obsm['tsne'][:, 1]
    sheet['clusters'] = sheet['clusters'].astype(str)

    return singlet.Dataset(
        counts_table=counts,
        samplesheet=sheet,
    )
def load_palantir_data(smoothed=False):
    """Load the Palantir CD34+ bone marrow data and annotate cell subtypes.

    Parameters:
        smoothed: when False (default) the counts are taken from
            ``an.raw.X`` (converted to dense); when True they are taken
            from the ``'MAGIC_imputed_data'`` entry of ``obsm``.

    Returns:
        A ``singlet.Dataset`` whose samplesheet is built from ``obs``, with
        extra ``tsne_1``/``tsne_2`` columns, ``clusters`` cast to str, and
        a ``'Cell Subtype'`` column derived from ``clusters``.
    """
    adata = anndata.read_h5ad(
        '../../data/external/Palantir/human_cd34_bm_rep1.h5ad',
    )
    gene_names = adata.var_names
    cell_names = adata.obs_names

    # Either source is transposed so that rows are labelled with the
    # variable names and columns with the observation names.
    if smoothed:
        matrix = adata.obsm['MAGIC_imputed_data'].T
    else:
        matrix = adata.raw.X.todense().T
    counts = singlet.CountsTable(
        data=matrix,
        index=gene_names,
        columns=cell_names,
    )

    # Sample metadata: obs table plus the stored t-SNE coordinates
    sheet = singlet.SampleSheet(adata.obs)
    sheet['tsne_1'] = adata.obsm['tsne'][:, 0]
    sheet['tsne_2'] = adata.obsm['tsne'][:, 1]
    sheet['clusters'] = sheet['clusters'].astype(str)

    ds = singlet.Dataset(
        counts_table=counts,
        samplesheet=sheet,
    )

    # Cluster label (as string) -> subtype name. ``replace`` leaves any
    # label that is not listed here untouched.
    cluster_to_subtype = {
        '0': 'HSC',
        '1': 'HSC',
        '2': 'Ery-precursor',
        '3': 'Mono',
        '4': 'Mono-precursor',
        '5': 'CLP',
        '6': 'Mono',
        '7': 'pDC',
        '8': 'Ery',
        '9': 'Mega',
    }
    subtypes = ds.samplesheet['clusters'].replace(cluster_to_subtype)
    ds.samplesheet['Cell Subtype'] = subtypes

    return ds
def load_our_data():
    """Load the me1 loom dataset and attach basic summary columns.

    Returns:
        A ``singlet.Dataset`` read from ``with_gene_names.loom`` (samples
        indexed by 'CellID', features by 'GeneName') with:
          - samplesheet['coverage']: sum of counts along axis 0
          - samplesheet['n_genes']: number of entries >= 1 along axis 0
          - featuresheet['exp_avg']: mean of counts along axis 1
    """
    source = {
        'path': '../../data/sequencing/me1/with_gene_names.loom',
        'index_samples': 'CellID',
        'index_features': 'GeneName',
    }
    ds = singlet.Dataset(dataset=source)

    # Per-sample summaries
    coverage = ds.counts.sum(axis=0)
    ds.samplesheet['coverage'] = coverage
    detected = ds.counts >= 1
    ds.samplesheet['n_genes'] = detected.sum(axis=0)

    # Per-feature summary
    average = ds.counts.mean(axis=1)
    ds.featuresheet['exp_avg'] = average

    return ds
# The local singlet checkout is put on the path and its config file name is
# exported before the import below (presumably singlet reads the variable at
# import time -- verify against the singlet documentation).
sys.path.append('/home/fabio/university/postdoc/singlet')
os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
import singlet


if __name__ == '__main__':

    # Raw dataset, with features indexed by Ensembl ID
    fdn = '../../data/sequencing/me1/'
    fn_dataset = fdn+'raw.loom'
    ds = singlet.Dataset(
        dataset={
            'path': fn_dataset,
            'index_samples': 'CellID',
            'index_features': 'EnsemblID',
        },
    )

    # Conversion table indexed by its first column. The ``squeeze`` keyword
    # of read_csv was deprecated in pandas 1.4 and removed in 2.0; squeezing
    # the columns axis after reading is the documented equivalent and also
    # works on older pandas versions.
    conv = pd.read_csv(
        '../../data/gene_ensemblId_name.tsv',
        sep='\t',
        index_col=0,
    ).squeeze('columns')

    print('Restrict to features with a gene name')
    # Keep only the features whose name appears in the conversion index
    gids = ds.featurenames
    gids = gids[gids.isin(conv.index)]
    ds.query_features_by_name(gids, inplace=True)
datasetd = { 'cd137': 'anti-CD137_7dpi', 'isotype_control': 'isotype_control', 'uninfected': 'M_GV-Na_ve-Na_ve', } pa = argparse.ArgumentParser() pa.add_argument('--sample', required=True, choices=datasetnames) pa.add_argument('--normalized', action='store_true') args = pa.parse_args() dn = args.sample print('Load {:} data from loom file'.format(dn)) sn = datasetd[dn] if args.normalized: ds = singlet.Dataset(dataset=dn + '_cpm') # FIXME ds.counts = singlet.CountsTable(ds.counts) ds.counts._normalized = 'counts_per_million' else: ds = singlet.Dataset(dataset=dn) ds.samplesheet['coverage'] = ds.counts.sum(axis=0) print('Normalize cpm') normg = ds.samplesheet['coverage'] / 1000000 ds.counts = singlet.CountsTable(ds.counts / normg) ds.counts._normalized = 'counts_per_million' print('Save to normalized loom file') ds.to_dataset_file(
enh['LMO2'] = enh['Ebox_motifs'] enh = enh[tfs] enh.columns.name = 'Motif' return enh if __name__ == '__main__': fn_ds = '../../data/sequencing/me1/normalized_7tfs.loom' if not os.path.isfile(fn_ds): print('Read loom raw file') fdn = '../../data/sequencing/me1/' fn_dataset = fdn + 'raw.loom' ds = singlet.Dataset(dataset={ 'path': fn_dataset, 'index_samples': 'CellID', 'index_features': 'EnsemblID', }, ) ds.samplesheet['coverage'] = ds.counts.sum(axis=0) ds.samplesheet['n_genes'] = (ds.counts >= 1).sum(axis=0) ds.featuresheet['exp_avg'] = ds.counts.mean(axis=1) ds.counts.normalize('counts_per_ten_thousand', inplace=True) tmp = ds.featuresheet.loc[ds.featuresheet['GeneName'].isin(tfs), 'GeneName'] dic = {val: key for key, val in tmp.items()} idx = [dic[val] for val in tfs] dst = ds.query_features_by_name(idx) dst.reindex('features', 'GeneName', inplace=True)
sys.path.append('/home/fabio/university/postdoc/singlet') import singlet if __name__ == '__main__': fig_fdn = '../../figures' pa = argparse.ArgumentParser() pa.add_argument('--save', action='store_true') args = pa.parse_args() print('Load data') data_fdn = '../../data/sequencing' fn_normalised = f'{data_fdn}/normalised.h5ad' ds = singlet.Dataset(dataset={ 'path': fn_normalised, }) print('Loaded') print('Load umap') fn_umap = f'{data_fdn}/umap.tsv' vs = pd.read_csv(fn_umap, sep='\t', index_col=0) vs = vs.loc[ds.samplenames] ds.obs['umap1'] = vs['umap1'] ds.obs['umap2'] = vs['umap2'] ds.obs['leiden'] = vs['leiden'].astype(str) print('Sort by average Runx1 expression') dsa = ds.average('samples', by='leiden') ds.obs['cluster_new'] = ds.obs['leiden'].map({ '3': '0',
print('Export to loom file') col_attrs = {col: metaf[col].values for col in metaf.columns} col_attrs['CellID'] = metaf.index.values row_attrs = {'GeneName': counts_gn.index.values} loompy.create( fn_loom, counts_gn.values, col_attrs=col_attrs, row_attrs=row_attrs, ) print('Load back loom file to check umap') ds = singlet.Dataset( dataset={ 'path': fn_loom, 'index_samples': 'CellID', 'index_features': 'GeneName', }) features = ds.feature_selection.overdispersed_within_groups('sample') dsf = ds.query_features_by_name(features) dsc = dsf.dimensionality.pca(n_dims=30, return_dataset='samples') vs = dsc.dimensionality.umap() cus = ds.samplesheet['cluster'].unique() cmap = dict(zip(cus, sns.color_palette('husl', n_colors=len(cus)))) fig, ax = plt.subplots(figsize=(6, 4)) for cu in cus: x, y = vs.loc[ds.samplesheet['cluster'] == cu].values.T ax.scatter(x, y, s=30, color=cmap[cu], alpha=0.6, label=cu) ax.legend( loc='upper left',
genes = an.var_names cells = an.obs_names counts = singlet.CountsTable( data=an.X.T, index=genes, columns=cells, ) ss = singlet.SampleSheet(an.obs) ss['tsne_1'] = an.obsm['tsne'][:, 0] ss['tsne_2'] = an.obsm['tsne'][:, 1] ss['clusters'] = ss['clusters'].astype(str) ds = singlet.Dataset( counts_table=counts, samplesheet=ss, ) print('Get MAGIC smoothed data') counts = singlet.CountsTable( data=an.obsm['MAGIC_imputed_data'].T, index=genes, columns=cells, ) dsM = singlet.Dataset( counts_table=counts, samplesheet=ss, ) print('Plot t-SNEs from their metadata') genes = [