def load_epi_TMS(average='geometric'):
    """Load the Tabula Muris Senis lung epithelial droplet dataset.

    The h5ad file is read into a singlet Dataset. Every sample gets a
    'Timepoint' of 'Adult' and a 'cellSubtype' column copied from
    'free_annotation', with the two alveolar epithelial names shortened to
    'AT1' / 'AT2'. Counts are then normalized in place.

    Parameters:
        average: controls what is returned.
            - falsy: the normalized, non-averaged dataset;
            - 'geometric': counts are log-transformed in place before
              averaging (no back-transformation is applied here);
            - any other truthy value: the normalized counts are averaged
              as they are.

    Returns:
        The normalized Dataset when `average` is falsy, otherwise the
        dataset averaged over samples by ('cellSubtype', 'Timepoint').
    """
    from singlet import Dataset

    # Long annotation names -> short cell subtype labels
    short_names = {
        'Alveolar Epithelial Type 2': 'AT2',
        'Alveolar Epithelial Type 1': 'AT1',
    }
    source = {
        'path': '../../data/tabulamurissenis/lung_epi_droplet.h5ad',
        'index_samples': 'obs_names',
        'index_features': 'var_names',
        'bit_precision': 32,
    }
    epi = Dataset(dataset=source)

    # Add the metadata columns that the averaging step groups by
    epi.samplesheet['Timepoint'] = 'Adult'
    annotation = epi.samplesheet['free_annotation']
    epi.samplesheet['cellSubtype'] = annotation.replace(short_names)

    epi.counts.normalize(inplace=True)

    if average:
        if average == 'geometric':
            # Averaging logged counts is what makes the mean "geometric"
            epi.counts.log(inplace=True)
        return epi.average('samples', by=['cellSubtype', 'Timepoint'])

    return epi
def load_epi_development(average='geometric'):
    """Load the Cohen et al. 2018 lung epithelial loom dataset.

    The loom file is read into a singlet Dataset and its counts are
    normalized in place.

    Parameters:
        average: controls what is returned.
            - falsy: the normalized, non-averaged dataset;
            - 'geometric': counts are log-transformed in place before
              averaging (no back-transformation is applied here);
            - any other truthy value: the normalized counts are averaged
              as they are.

    Returns:
        The normalized Dataset when `average` is falsy, otherwise the
        dataset averaged over samples by ('cellSubtype', 'Timepoint').
        NOTE(review): both columns are assumed to already exist in the
        samplesheet stored in the loom file; nothing here creates them.
    """
    from singlet import Dataset

    source = {
        'path': '../../data/Cohen_et_al_2018/cohen2018_lung_epi_cpm.loom',
        'index_samples': 'obs_names',
        'index_features': 'var_names',
        'bit_precision': 32,
    }
    epi = Dataset(dataset=source)
    epi.counts.normalize(inplace=True)

    if average:
        if average == 'geometric':
            # Averaging logged counts is what makes the mean "geometric"
            epi.counts.log(inplace=True)
        return epi.average('samples', by=['cellSubtype', 'Timepoint'])

    return epi
# Folder holding our final endothelial datasets
fdn_data = '../../data/sequencing/datasets/endo_final/'
# Loom files to load, keyed by data source
fns_loom = {
    'ours': fdn_data + 'endo_normoxia.loom',
    'tabulamuris': '../../data/tabulamuris/FACS_alltissues/endos.loom',
}
# Figure folder; it is created at startup if missing
fig_fdn = '../../figures/endomese_share/endo_paper_figure_2/'

if __name__ == '__main__':

    # Make sure the figure folder exists before anything else runs
    os.makedirs(fig_fdn, exist_ok=True)

    print('Load our endo data')
    ds = Dataset(dataset={
        'path': fns_loom['ours'],
        'index_samples': '_index',
        'index_features': 'GeneName',
        },
    )
    # Annotate every cell of our dataset with its tissue and data source
    ds.samplesheet['Tissue'] = 'Lung'
    # Name the cell axis 'CellID' in both the samplesheet and the counts
    # (presumably to line up with the Tabula Muris file, which is indexed
    # by 'CellID' -- TODO confirm)
    ds.samplesheet.index.name = 'CellID'
    ds.counts.columns.name = 'CellID'
    ds.samplesheet['Dataset'] = 'ours'

    print('Load Tabula Muris FACS endothelial')
    ds_tm = Dataset(dataset={
        'path': fns_loom['tabulamuris'],
        'index_samples': 'CellID',
        'index_features': 'GeneName',
        },
    )
    # Copy the lowercase 'tissue' column into 'Tissue', the column name used
    # for our dataset above
    ds_tm.samplesheet['Tissue'] = ds_tm.samplesheet['tissue']
    # Prefix the mouse ids with 'TM' (presumably so they cannot be confused
    # with our own mouse names -- TODO confirm)
    ds_tm.samplesheet['Mousename'] = 'TM' + ds_tm.samplesheet['mouse.id']
) if __name__ == '__main__': if not os.path.isfile(fns_loom['ours']): create_loom_from_figshare(fns_loom['ours']) if not os.path.isfile(fns_loom['tabulamuris']): ss = create_loom_from_tabulamurisfacs(fns_loom['tabulamuris']) print('Load loom file') ds = Dataset( dataset={ 'path': fns_loom['ours'], 'index_samples': 'CellID', 'index_features': 'GeneName', }, ) ds.samplesheet['cellSubtype'] = ds.samplesheet['Cell Subtype'] ds.query_samples_by_metadata( 'cellSubtype in ("Mac I", "Mac II", "Mac III", "Mac IV", "Mac V")', inplace=True) print('Load Schyns et al loom file') ds_sc = Dataset( dataset={ 'path': fns_loom['schyns'], 'index_samples': 'CellID', 'index_features': 'Gene', 'bit_precision': 32,
'pancreas', 'skin', 'spleen', 'thymus', 'tongue', 'trachea', ] if __name__ == '__main__': pa = argparse.ArgumentParser( description='Test loading various tabula muris tissues', ) pa.add_argument( '--tissues', default=tissues, nargs='+', choices=tissues, help='Tissue of origin', ) args = pa.parse_args() dss = {} for tissue in args.tissues: print(tissue) ds = Dataset( counts_table=tissue, samplesheet='alltissues', featuresheet='alltissues', ) dss[tissue] = ds
print('Parse gene metadata') fn_pbmc_meta_genes = '../data/pbmc_zanini/featuresheet_10_10_unique_L1.tsv' featuresheet = pd.read_csv(fn_pbmc_meta_genes, sep='\t', index_col=0) c = Counter(featuresheet['GeneName']) ind = [] for fea in featuresheet.index: if c[featuresheet.at[fea, 'GeneName']] == 1: ind.append(fea) featuresheet = featuresheet.loc[ind] counts = counts.loc[ind] print('Reannotate cell types') ds = Dataset( samplesheet=cells, counts_table=counts, featuresheet=featuresheet, ) ds.reindex(axis='features', column='GeneName', inplace=True) print('Filter low-quality cells') ds.samplesheet['coverage'] = ds.samplesheet['coverage'].astype(int) ds.query_samples_by_metadata('coverage >= 50000', inplace=True) print('Ignore HLA types and TCR and BCR variable regions') def fun(fn): if fn in ('HLA-A', 'HLA-B', 'HLA-C'): return False if fn.startswith('TRBV'): return False
n_genes = (counts_table.iloc[len(counts_table._spikeins ):-len(counts_table._otherfeatures)] >= 1).sum(axis=0) samplesheet['number_of_genes_1plusreads'] = n_genes print('Exclude slashes for loom') featuresheet.rename(columns={'Chromosome/scaffold name': 'Chromosome'}, inplace=True) print('Add index name for loom') samplesheet.index.name = 'CellID' counts_table.columns.name = 'CellID' ds = Dataset( counts_table=counts_table, featuresheet=featuresheet, samplesheet=samplesheet, ) print('Save raw dataset as loom file') fn_raw = '../../data/sequencing/datasets/all_{:}/raw.loom'.format( version) if not args.dry: ds.to_dataset_file(fn_raw, fmt='loom') # BLOCK 2: filter genes and cells if args.block == 2: print('Load raw loom file') import loompy fn_raw = '../../data/sequencing/datasets/all_{:}/raw.loom'.format(
print('Feature selection') features = ds.feature_selection.overdispersed_within_groups('Mousename', inplace=False) dsf = ds.query_features_by_name(features) print('PCA') dsc = dsf.dimensionality.pca(n_dims=30, robust=False, return_dataset='samples') print('Smoothen along knn') edges = dsc.graph.knn('samples', 20, return_kind='edges') # Smoothen twice (reaching second-order neighbors) counts_smooth = ds.counts.smoothen_neighbors(edges, n_iterations=2) ds_smooth = Dataset( counts_table=counts_smooth, samplesheet=ds.samplesheet.copy(), ) print('Plot known signaling genes, smoothened') genes = [g for g in genes_all if (g in ds.featurenames)] markers = list(genes) + [ 'cellSubtype', ] fig, axs = plt.subplots(4, 5, figsize=(12, 8), sharex=True, sharey=True) axs = axs.ravel() for i, (gene, ax) in enumerate(zip(markers, axs)): if gene == 'cellSubtype': cmap = { 'Mac IV': 'darkolivegreen', 'Mac II': 'lime',
row_attrs=row_attrs, ) if __name__ == '__main__': if not os.path.isfile(fns_loom['ours']): create_loom_from_figshare(fns_loom['ours']) if not os.path.isfile(fns_loom['tabulamuris']): ss = create_loom_from_tabulamurisfacs(fns_loom['tabulamuris']) print('Load loom file') ds = Dataset(dataset={ 'path': fns_loom['ours'], 'index_samples': 'CellID', 'index_features': 'GeneName', }, ) ds.samplesheet['Tissue'] = 'lung' print('Load Tabula Muris FACS immune') ds_tm = Dataset(dataset={ 'path': fns_loom['tabulamuris'], 'index_samples': 'CellID', 'index_features': 'GeneName', }, ) ds_tm.samplesheet['Tissue'] = ds_tm.samplesheet['tissue'] ds_tm.samplesheet['Cell Subtype'] = ds_tm.samplesheet[ 'cell_ontology_class'] ds_tm.samplesheet['Cell Subtype'].replace( {