Python Dataset Examples, singlet.Dataset Python Examples

Example #1

0

Show file

def load_palantir_data(smoothed=False):
    """Build a singlet Dataset from the Palantir ``human_cd34_bm_rep1`` file.

    Parameters:
        smoothed: when false (the default) the counts are taken from
            ``raw.X`` (densified); when true they are taken from the
            ``'MAGIC_imputed_data'`` entry of ``obsm``.

    Returns:
        A ``singlet.Dataset`` whose counts table is indexed by the file's
        variable names (rows) and observation names (columns), and whose
        samplesheet is built from ``obs`` with ``'tsne_1'``/``'tsne_2'``
        columns added and ``'clusters'`` converted to strings.
    """
    adata = anndata.read_h5ad(
        '../../data/external/Palantir/human_cd34_bm_rep1.h5ad',
    )
    feature_names = adata.var_names
    cell_names = adata.obs_names

    # Pick the source matrix once; the table construction is shared
    if smoothed:
        matrix = adata.obsm['MAGIC_imputed_data']
    else:
        matrix = adata.raw.X.todense()

    # Transposed so that rows line up with `index` and columns with `columns`
    counts = singlet.CountsTable(
        data=matrix.T,
        index=feature_names,
        columns=cell_names,
    )

    sheet = singlet.SampleSheet(adata.obs)
    # First two columns of the stored t-SNE embedding become metadata columns
    for position, column in enumerate(('tsne_1', 'tsne_2')):
        sheet[column] = adata.obsm['tsne'][:, position]
    sheet['clusters'] = sheet['clusters'].astype(str)

    return singlet.Dataset(counts_table=counts, samplesheet=sheet)

Example #2

0

Show file

def load_palantir_data(smoothed=False):
    """Load the Palantir ``human_cd34_bm_rep1`` file and label cell subtypes.

    Parameters:
        smoothed: when false (the default) the counts are taken from
            ``raw.X`` (densified); when true they are taken from the
            ``'MAGIC_imputed_data'`` entry of ``obsm``.

    Returns:
        A ``singlet.Dataset`` whose samplesheet carries ``'tsne_1'``,
        ``'tsne_2'``, string-valued ``'clusters'`` and a ``'Cell Subtype'``
        column obtained by replacing cluster labels '0'-'9' with names.
    """
    adata = anndata.read_h5ad(
        '../../data/external/Palantir/human_cd34_bm_rep1.h5ad',
    )
    feature_names = adata.var_names
    cell_names = adata.obs_names

    matrix = (
        adata.obsm['MAGIC_imputed_data'] if smoothed
        else adata.raw.X.todense()
    )
    # Transposed so that rows line up with `index` and columns with `columns`
    counts = singlet.CountsTable(
        data=matrix.T,
        index=feature_names,
        columns=cell_names,
    )

    sheet = singlet.SampleSheet(adata.obs)
    sheet['tsne_1'] = adata.obsm['tsne'][:, 0]
    sheet['tsne_2'] = adata.obsm['tsne'][:, 1]
    sheet['clusters'] = sheet['clusters'].astype(str)

    ds = singlet.Dataset(counts_table=counts, samplesheet=sheet)

    # Cluster label -> subtype name; labels not listed here are left as-is
    # by ``replace``
    subtype_by_cluster = {
        '0': 'HSC',
        '1': 'HSC',
        '2': 'Ery-precursor',
        '3': 'Mono',
        '4': 'Mono-precursor',
        '5': 'CLP',
        '6': 'Mono',
        '7': 'pDC',
        '8': 'Ery',
        '9': 'Mega',
    }
    ds.samplesheet['Cell Subtype'] = (
        ds.samplesheet['clusters'].replace(subtype_by_cluster)
    )

    return ds

Example #3

0

Show file

def load_our_data():
    """Load the ``with_gene_names.loom`` dataset and add summary columns.

    Returns:
        A ``singlet.Dataset`` indexed by 'CellID' (samples) and 'GeneName'
        (features), with ``'coverage'`` and ``'n_genes'`` added to the
        samplesheet and ``'exp_avg'`` added to the featuresheet.
    """
    loom_spec = {
        'path': '../../data/sequencing/me1/with_gene_names.loom',
        'index_samples': 'CellID',
        'index_features': 'GeneName',
    }
    ds = singlet.Dataset(dataset=loom_spec)

    # Per-sample totals: sum of counts along axis 0
    coverage = ds.counts.sum(axis=0)
    ds.samplesheet['coverage'] = coverage

    # Per-sample number of entries with a count of at least 1
    detected = ds.counts >= 1
    ds.samplesheet['n_genes'] = detected.sum(axis=0)

    # Per-feature mean along axis 1
    average = ds.counts.mean(axis=1)
    ds.featuresheet['exp_avg'] = average

    return ds

Example #4

0

Show file

sys.path.append('/home/fabio/university/postdoc/singlet')
os.environ['SINGLET_CONFIG_FILENAME'] = 'singlet.yml'
import singlet





if __name__ == '__main__':

    # Load the raw loom file, indexing samples by 'CellID' and features by
    # 'EnsemblID'.
    fdn = '../../data/sequencing/me1/'
    fn_dataset = fdn+'raw.loom'
    ds = singlet.Dataset(
        dataset={
            'path': fn_dataset,
            'index_samples': 'CellID',
            'index_features': 'EnsemblID',
            },
        )

    # Conversion table read from a tab-separated file; its first column is
    # used as the index.
    # NOTE: the ``squeeze`` keyword of ``read_csv`` was deprecated in pandas
    # 1.4 and removed in 2.0, so squeeze the resulting frame explicitly
    # instead. A single remaining column still becomes a Series, as before.
    conv = pd.read_csv(
            '../../data/gene_ensemblId_name.tsv',
            sep='\t',
            index_col=0,
            ).squeeze('columns')

    print('Restrict to features with a gene name')
    # Keep only the feature names that appear in the conversion table index
    gids = ds.featurenames
    gids = gids[gids.isin(conv.index)]
    ds.query_features_by_name(gids, inplace=True)

Example #5

0

Show file

    datasetd = {
        'cd137': 'anti-CD137_7dpi',
        'isotype_control': 'isotype_control',
        'uninfected': 'M_GV-Na_ve-Na_ve',
    }

    pa = argparse.ArgumentParser()
    pa.add_argument('--sample', required=True, choices=datasetnames)
    pa.add_argument('--normalized', action='store_true')
    args = pa.parse_args()
    dn = args.sample

    print('Load {:} data from loom file'.format(dn))
    sn = datasetd[dn]
    if args.normalized:
        ds = singlet.Dataset(dataset=dn + '_cpm')
        # FIXME
        ds.counts = singlet.CountsTable(ds.counts)
        ds.counts._normalized = 'counts_per_million'
    else:
        ds = singlet.Dataset(dataset=dn)

        ds.samplesheet['coverage'] = ds.counts.sum(axis=0)

        print('Normalize cpm')
        normg = ds.samplesheet['coverage'] / 1000000
        ds.counts = singlet.CountsTable(ds.counts / normg)
        ds.counts._normalized = 'counts_per_million'

        print('Save to normalized loom file')
        ds.to_dataset_file(

Example #6

0

Show file

    enh['LMO2'] = enh['Ebox_motifs']
    enh = enh[tfs]
    enh.columns.name = 'Motif'
    return enh


if __name__ == '__main__':

    # The block below only runs when this loom file does not exist yet
    fn_ds = '../../data/sequencing/me1/normalized_7tfs.loom'
    if not os.path.isfile(fn_ds):
        print('Read loom raw file')
        fdn = '../../data/sequencing/me1/'
        fn_dataset = fdn + 'raw.loom'
        # Samples are indexed by 'CellID', features by 'EnsemblID'
        ds = singlet.Dataset(dataset={
            'path': fn_dataset,
            'index_samples': 'CellID',
            'index_features': 'EnsemblID',
        }, )

        # Summary statistics computed before normalization:
        # per-sample sum of counts, per-sample number of entries >= 1,
        # and per-feature mean
        ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
        ds.samplesheet['n_genes'] = (ds.counts >= 1).sum(axis=0)
        ds.featuresheet['exp_avg'] = ds.counts.mean(axis=1)

        ds.counts.normalize('counts_per_ten_thousand', inplace=True)

        # `tfs` is defined outside this snippet - presumably a list of gene
        # names; verify against the rest of the file.
        # Select the 'GeneName' entries of the features whose name is in `tfs`
        tmp = ds.featuresheet.loc[ds.featuresheet['GeneName'].isin(tfs),
                                  'GeneName']
        # Invert to gene name -> feature index; if a gene name occurs more
        # than once, the last occurrence wins
        dic = {val: key for key, val in tmp.items()}
        # Feature indices in the same order as `tfs`; raises KeyError if a
        # name in `tfs` was not found in the featuresheet
        idx = [dic[val] for val in tfs]
        dst = ds.query_features_by_name(idx)
        # Switch the feature index of the subset to the 'GeneName' column
        dst.reindex('features', 'GeneName', inplace=True)

Example #7

0

Show file

sys.path.append('/home/fabio/university/postdoc/singlet')
import singlet

if __name__ == '__main__':

    fig_fdn = '../../figures'

    pa = argparse.ArgumentParser()
    pa.add_argument('--save', action='store_true')
    args = pa.parse_args()

    print('Load data')
    data_fdn = '../../data/sequencing'
    fn_normalised = f'{data_fdn}/normalised.h5ad'
    ds = singlet.Dataset(dataset={
        'path': fn_normalised,
    })
    print('Loaded')

    print('Load umap')
    fn_umap = f'{data_fdn}/umap.tsv'
    vs = pd.read_csv(fn_umap, sep='\t', index_col=0)
    vs = vs.loc[ds.samplenames]
    ds.obs['umap1'] = vs['umap1']
    ds.obs['umap2'] = vs['umap2']
    ds.obs['leiden'] = vs['leiden'].astype(str)

    print('Sort by average Runx1 expression')
    dsa = ds.average('samples', by='leiden')
    ds.obs['cluster_new'] = ds.obs['leiden'].map({
        '3': '0',

Example #8

0

Show file

        print('Export to loom file')
        col_attrs = {col: metaf[col].values for col in metaf.columns}
        col_attrs['CellID'] = metaf.index.values
        row_attrs = {'GeneName': counts_gn.index.values}
        loompy.create(
            fn_loom,
            counts_gn.values,
            col_attrs=col_attrs,
            row_attrs=row_attrs,
        )

        print('Load back loom file to check umap')
        ds = singlet.Dataset(
            dataset={
                'path': fn_loom,
                'index_samples': 'CellID',
                'index_features': 'GeneName',
            })
        features = ds.feature_selection.overdispersed_within_groups('sample')
        dsf = ds.query_features_by_name(features)
        dsc = dsf.dimensionality.pca(n_dims=30, return_dataset='samples')
        vs = dsc.dimensionality.umap()

        cus = ds.samplesheet['cluster'].unique()
        cmap = dict(zip(cus, sns.color_palette('husl', n_colors=len(cus))))
        fig, ax = plt.subplots(figsize=(6, 4))
        for cu in cus:
            x, y = vs.loc[ds.samplesheet['cluster'] == cu].values.T
            ax.scatter(x, y, s=30, color=cmap[cu], alpha=0.6, label=cu)
        ax.legend(
            loc='upper left',

Example #9

0

Show file

    genes = an.var_names
    cells = an.obs_names

    counts = singlet.CountsTable(
        data=an.X.T,
        index=genes,
        columns=cells,
    )
    ss = singlet.SampleSheet(an.obs)
    ss['tsne_1'] = an.obsm['tsne'][:, 0]
    ss['tsne_2'] = an.obsm['tsne'][:, 1]
    ss['clusters'] = ss['clusters'].astype(str)

    ds = singlet.Dataset(
        counts_table=counts,
        samplesheet=ss,
    )

    print('Get MAGIC smoothed data')
    counts = singlet.CountsTable(
        data=an.obsm['MAGIC_imputed_data'].T,
        index=genes,
        columns=cells,
    )
    dsM = singlet.Dataset(
        counts_table=counts,
        samplesheet=ss,
    )

    print('Plot t-SNEs from their metadata')
    genes = [