Beispiel #1
0
    # Build the dataset from the counts, featuresheet and samplesheet objects
    # (these names are defined outside this excerpt).
    ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
    )

    print('Add normalized virus counts')
    # Scale each sample's 'n_reads_virus' to reads per million of 'n_reads'.
    ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[
        'n_reads_virus'] / ds.samplesheet['n_reads']
    # The 0.1 offset keeps log10 finite when the per-million value is zero.
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Filter low-quality cells')
    # The threshold is copied into a local variable because the query string
    # refers to it as '@n_reads_min' and locals() is handed over as local_dict.
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    print('Limit to decently expressed genes')
    # args.n_cpm_min_genes is indexed as a pair: [0] is the count threshold,
    # [1] the minimum number of entries along axis=1 that must exceed it.
    # NOTE(review): presumably rows are genes and columns are cells - confirm.
    ind = (ds.counts > args.n_cpm_min_genes[0]).sum(
        axis=1) >= args.n_cpm_min_genes[1]
    ds.counts = ds.counts.loc[ind]

    print('Ignore genes with multiple IDs')
    from collections import Counter
    # Tally how often each GeneName occurs, then drop every featuresheet row
    # whose name occurs more than once.
    genec = Counter(ds.featuresheet['GeneName'].values)
    genes_multiple = [k for k, v in genec.items() if v > 1]
    ds.featuresheet = ds.featuresheet.loc[~ds.featuresheet['GeneName'].
                                          isin(genes_multiple)]

    print('Translate to gene names')
Beispiel #2
0
        counts_table='dengue',
        samplesheet='virus',
        featuresheet='humanGC38',
    )
    # Keep only samples matching the count query 'total >= 50000'.
    ds.query_samples_by_counts('total >= 50000', inplace=True)

    ds.samplesheet.rename(columns={'time [h]': 'time'}, inplace=True)
    # Coverage is taken from the counts before they are normalized below, and
    # is kept both as a samplesheet column and in the local 'cov'.
    cov = ds.samplesheet['coverage'] = ds.counts.sum(axis=0)
    ds.counts.normalize('counts_per_million', inplace=True)

    # Virus reads per million, with the virus reads added to the coverage in
    # the denominator.
    n = ds.samplesheet['numberDengueReads'].astype(int)
    ds.samplesheet['virus_reads_per_million'] = 1e6 * n / (cov + n)
    ds.counts.log(inplace=True)

    # Only select cells without virus
    ds.query_samples_by_metadata('virus_reads_per_million < 0.1', inplace=True)

    # Check table with number of cells
    # (count per ('time', 'MOI') group taken from the first column, unstacked
    # and restricted to the rows '4', '12', '24' and '48')
    table = (ds.samplesheet.groupby(
        ['time', 'MOI']).count().iloc[:,
                                      0].unstack().fillna(0).astype(int).loc[[
                                          '4', '12', '24', '48'
                                      ]])

    print('Selecting only early 2 time points')
    # The rest has too few uninfected cells
    ds.query_samples_by_metadata('time in ["4", "12"]', inplace=True)
    # Split by the 'MOI' column and compare the '0' group against the '1'
    # group, keeping the 'P-value' column of the comparison result.
    dsm = ds.split('MOI')
    ks = dsm['0'].compare(dsm['1'])['P-value']

    # Get the top hits for GO analysis

    # Get the top hits for GO analysis
Beispiel #3
0
date:       07/08/17
content:    Test Dataset class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.dataset import Dataset
    # The samplesheet and counts table are referred to by name here.
    # NOTE(review): presumably these names are looked up in the config file
    # mentioned above - confirm.
    ds = Dataset(
            samplesheet='example_sheet_tsv',
            counts_table='example_table_tsv')

    # Each check below queries with inplace=False, so 'ds' itself is reused
    # for the next query, and asserts on the sample names of the result.
    print('Query samples by metadata')
    ds_tmp = ds.query_samples_by_metadata(
            'experiment == "test_pipeline"',
            inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('test_pipeline',))
    print('Done!')

    print('Query sample by counts in one gene')
    ds_tmp = ds.query_samples_by_counts('KRIT1 > 100', inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('third_sample',))
    print('Done!')

    print('Query sample by total counts')
    ds_tmp = ds.query_samples_by_counts('total < 3000000', inplace=False)
    assert(tuple(ds_tmp.samplenames) == ('second_sample',))
    print('Done!')

    print('Query sample by mapped counts')
Beispiel #4
0
            sep='\t',
            index_col=0,
            dtype={0: str},
        ))

    print('Build dataset')
    # counts, featuresheet and samplesheet are defined outside this excerpt.
    ds = Dataset(
        counts_table=counts,
        featuresheet=featuresheet,
        samplesheet=samplesheet,
    )

    print('Filter low-quality cells')
    # The threshold is copied into a local variable because the query string
    # refers to it as '@n_reads_min' and locals() is handed over as local_dict.
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    # Biosample "2-uninfected" is removed unless args.keep2 is set.
    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata('biosample != "2-uninfected"',
                                     inplace=True)

    print('Add normalized virus counts')
    # Computed after the filters above, so only the remaining samples get
    # these columns.
    ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[
        'n_reads_virus'] / ds.samplesheet['n_reads']
    # The 0.1 offset keeps log10 finite when the per-million value is zero.
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Limit to decently expressed genes')
    # Boolean mask: at least 10 entries along axis=1 have a count of 10 or
    # more. NOTE(review): presumably rows are genes - confirm.
    ind = (ds.counts >= 10).sum(axis=1) >= 10
Beispiel #5
0
                '$10^5$',
            ])
            ax.set_ylabel('{:} per million reads'.format(gname))
            ax.set_xlabel('')
            for tk in ax.get_xticklabels():
                tk.set_rotation(300)

        return fig

    # QC all cells
    fig = plot_qcs(ds)
    fig.suptitle('All cells')
    # NOTE(review): the rect stops at 0.96 of the figure height, presumably to
    # leave room for the suptitle - confirm.
    plt.tight_layout(rect=(0, 0, 1, 0.96))

    # Filter cells and reQC
    # The query is not run in place here: the result is bound to 'dsgood' and
    # 'ds' is not reassigned.
    n_reads_min = args.n_reads_min
    dsgood = ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                          local_dict=locals())
    fig = plot_qcs(dsgood)
    # The '{:d}' format requires n_reads_min to be an integer.
    fig.suptitle('Only cells with {:d}+ reads'.format(n_reads_min))
    plt.tight_layout(rect=(0, 0, 1, 0.96))

    ## Probably not worth saving, we only trash ~2,500 out of ~8,700 cells
    #if args.save:
    #    dsgood.counts.to_csv(
    #        '../../data/mouse_mCMV_1/all/dataframe_n_reads_min_{:}.tsv'.format(n_reads_min),
    #        index=True, sep='\t')

    # Switch matplotlib to interactive mode before showing the figures.
    plt.ion()
    plt.show()
Beispiel #6
0
                            sharex=True,
                            sharey=True,
                            figsize=(8, 4))
    # Draw the same reduced coordinates 'vs' twice: coloured by the 'dbscan'
    # labels on the first axis and by the 'kmeans' labels on the second.
    dsr.plot.scatter_reduced_samples(vs,
                                     color_by='dbscan',
                                     ax=axs[0],
                                     zorder=10)
    dsr.plot.scatter_reduced_samples(vs,
                                     color_by='kmeans',
                                     ax=axs[1],
                                     zorder=10)

    axs[0].set_title('DBSCAN')
    axs[1].set_title('K-means, 7 clusters')

    plt.tight_layout()

    # Copy the k-means labels from dsr onto ds and split ds into one dataset
    # per cluster value.
    ds.samplesheet['cluster'] = dsr.samplesheet['kmeans']
    ds_dict = ds.split(phenotypes=['cluster'])

    # For every cluster, compare it against all samples outside that cluster
    # and keep the index labels of the 10 smallest P-values.
    genes_by_cluster = {}
    for key, dsi in ds_dict.items():
        # '@key' in the query refers to the loop variable via locals().
        dso = ds.query_samples_by_metadata('cluster!=@key',
                                           local_dict=locals())
        genes_by_cluster[key] = dsi.compare(dso)['P-value'].nsmallest(10).index
    # Regression check: the first three hits for cluster 1 must be these IDs.
    assert (genes_by_cluster[1][:3].tolist() == [
        'ENSG00000138085', 'ENSG00000184076', 'ENSG00000116459'
    ])

    plt.show()
Beispiel #7
0
    # '--keep2' is a store_true flag, so args.keep2 is False unless the option
    # is given on the command line.
    pa.add_argument('--keep2',
                    action='store_true',
                    help='Keep sample 2-uninfected despite low quality')
    args = pa.parse_args()

    print('Load dataset')
    # All three tables are referred to by the name 'combined'.
    # NOTE(review): presumably resolved through the singlet configuration -
    # confirm.
    ds = Dataset(
        counts_table='combined',
        featuresheet='combined',
        samplesheet='combined',
    )

    print('Filter low-quality cells')
    # The threshold is copied into a local variable because the query string
    # refers to it as '@n_reads_min' and locals() is handed over as local_dict.
    n_reads_min = args.n_reads_min
    ds.query_samples_by_metadata('n_reads > @n_reads_min',
                                 local_dict=locals(),
                                 inplace=True)

    # Biosample "2-uninfected" is removed unless --keep2 was passed.
    if not args.keep2:
        print('Filter out sample 2-uninfected (low quality)')
        ds.query_samples_by_metadata('biosample != "2-uninfected"',
                                     inplace=True)

    print('Add normalized virus counts')
    # Computed after the filters above, so only the remaining samples get
    # these columns.
    ds.samplesheet['virus_reads_per_million'] = 1e6 * ds.samplesheet[
        'n_reads_virus'] / ds.samplesheet['n_reads']
    # The 0.1 offset keeps log10 finite when the per-million value is zero.
    ds.samplesheet['log_virus_reads_per_million'] = np.log10(
        0.1 + ds.samplesheet['virus_reads_per_million'])

    print('Limit to decently expressed genes')
    # Boolean mask: at least 10 entries along axis=1 have a count of 10 or
    # more. NOTE(review): presumably rows are genes - confirm.
    ind = (ds.counts >= 10).sum(axis=1) >= 10