Ejemplo n.º 1
0
def get_dataset(tissue, membrane_only=True):
    '''Build a Dataset from the counts of one tissue.

    Args:
        tissue: passed straight to parse_counts().
        membrane_only: if True, restrict the counts to the entries of
            parse_go_plasma_membrane() that are present in the counts index.

    Returns:
        A Dataset wrapping a SampleSheet and a CountsTable.

    NOTE(review): `cell_types` is not assigned anywhere in this function, so
    it has to come from an enclosing/module scope -- confirm it exists there.
    '''
    table = parse_counts(tissue)

    if membrane_only:
        membrane = parse_go_plasma_membrane()
        present = membrane.isin(table.index)
        table = table.loc[membrane[present]]

    return Dataset(
        samplesheet=SampleSheet(cell_types),
        counts_table=CountsTable(table),
    )
Ejemplo n.º 2
0
def parse_counts(tissue, regenerate=False):
    '''Load the counts table of a tissue from ../../data/MACAtSNE/.

    The file name is '<prefix><suffix>.csv', where the prefix is the
    'annotation glob' entry of config[tissue] when present (the tissue name
    otherwise) and the suffix is tried in order: 'CountTableNormalized' then
    'CountTable', or only 'CountTable' when `regenerate` is True.

    Args:
        tissue: key into `config`, also used as fallback file prefix.
        regenerate: if True, ignore any existing normalized file.

    Returns:
        A CountsTable with index named 'GeneName' and columns named 'Cell'.
        If the file path does not contain 'Normalized', the table is
        normalized, logged, and written next to the input file with the
        last four characters of the path replaced by 'Normalized.csv'.

    Raises:
        IOError: if no file, or more than one file, matches.
    '''
    import glob

    section = config[tissue]
    prefix = section['annotation glob'] if 'annotation glob' in section \
        else tissue

    suffixes = ('CountTable', ) if regenerate \
        else ('CountTableNormalized', 'CountTable')

    # Stop at the first suffix that matches at least one file
    matches = []
    for suffix in suffixes:
        pattern = '../../data/MACAtSNE/{:}{:}.csv'.format(prefix, suffix)
        matches = glob.glob(pattern)
        if matches:
            break

    if not matches:
        raise IOError('Counts file not found for tissue: {:}'.format(tissue))
    if len(matches) > 1:
        raise IOError(
            'Several counts files found for tissue: {:}'.format(tissue))
    path = matches[0]

    frame = pd.read_csv(path, sep=',', index_col=0)
    if '.' in frame.columns[0]:
        # Swap the first two dot-separated fields of every column name
        renamed = []
        for column in frame.columns:
            fields = column.split('.')[:2]
            renamed.append('{1}_{0}'.format(*fields))
        frame.columns = renamed
    frame.index.name = 'GeneName'
    frame.columns.name = 'Cell'

    table = CountsTable(frame)
    if 'Normalized' in path:
        table._normalized = 'counts_per_million'
        return table

    print('Normalize counts')
    table.normalize(inplace=True)
    print('Log counts')
    table.log(inplace=True)
    print('Write normalized counts to file')
    table.to_csv(path[:-4] + 'Normalized.csv', sep=',')
    return table
Ejemplo n.º 3
0
def ct():
    '''Return the first 200 rows of the 'example_table_tsv' counts table.'''
    from singlet.counts_table import CountsTable

    full_table = CountsTable.from_tablename('example_table_tsv')
    return full_table.iloc[:200]
Ejemplo n.º 4
0
def get_dataset(tissue,
                membrane_only=True,
                regenerate=False,
                go_contains=None,
                go_exclude=None):
    '''Build a Dataset for a tissue, merging its sub-tissues when needed.

    Args:
        tissue: key into `tissues_prediction`; the value is iterated to get
            the sub-tissue names passed to parse_annotations() and
            parse_counts().
        membrane_only: if True, keep only genes found in the index of
            parse_go_plasma_membrane().
        regenerate: forwarded to parse_counts().
        go_contains: if not None, keep only genes whose 'GONames' entry
            matches this pattern (pandas `str.contains`).
        go_exclude: if not None, keep only genes whose 'GONames' entry does
            NOT match this pattern. Mutually exclusive with `go_contains`.

    Returns:
        A Dataset. With a single sub-tissue its counts table is used as is;
        with several, the tables are merged over the union of their genes.

    Raises:
        ValueError: if both `go_contains` and `go_exclude` are given, or if
            the tissue lists no sub-tissues.
    '''
    # Validate the arguments before any file is parsed
    if (go_contains is not None) and (go_exclude is not None):
        raise ValueError('Use either go_contains or go_exclude')

    # The GO table and the gene selections do not depend on the sub-tissue,
    # so compute them once (assumes parse_go_plasma_membrane() returns the
    # same table on every call -- TODO confirm)
    genes_membrane = None
    genes_go = None
    if membrane_only or (go_contains is not None) or (go_exclude is not None):
        go = parse_go_plasma_membrane()
        if membrane_only:
            genes_membrane = go.index
        if go_contains is not None:
            genes_go = go.index[go['GONames'].str.contains(go_contains)]
        elif go_exclude is not None:
            genes_go = go.index[~go['GONames'].str.contains(go_exclude)]

    # Some tissues like brain were split for sorting, we merge them here
    dss = []
    for tissue_facs in tissues_prediction[tissue]:
        cell_types, _ = parse_annotations(tissue_facs)
        counts = parse_counts(tissue_facs, regenerate=regenerate)
        if genes_membrane is not None:
            present = genes_membrane.isin(counts.index)
            counts = counts.loc[genes_membrane[present]]
        if genes_go is not None:
            counts = counts.loc[np.intersect1d(genes_go, counts.index)]

        dss.append({'samplesheet': cell_types, 'counts': counts})

    if not dss:
        raise ValueError(
            'No subtissues listed for tissue: {:}'.format(tissue))

    if len(dss) == 1:
        return Dataset(
            samplesheet=SampleSheet(dss[0]['samplesheet']),
            counts_table=dss[0]['counts'],
        )

    # Merging is kind of messy because some genes are absent from either
    # subtissue; missing genes get a placeholder value for now (original
    # author's note: a better solution is being worked on)
    genes = set()
    for ds in dss:
        genes |= set(ds['counts'].index.values)
    genes = pd.Index(sorted(genes), name=dss[-1]['counts'].index.name)
    for ds in dss:
        genes_missing = genes[~genes.isin(ds['counts'].index)]
        for gene in genes_missing:
            # Original author's note: the tables are normalized,
            # pseudocounted, and logged, hence -1.0 rather than 0
            ds['counts'].loc[gene] = -1.0
        ds['counts'] = ds['counts'].loc[genes]

    samplesheet_all = pd.concat([ds['samplesheet'] for ds in dss], axis=0)
    ngenes = len(genes)
    ncells = sum(ds['samplesheet'].shape[0] for ds in dss)
    counts_all = pd.DataFrame(np.zeros((ngenes, ncells), float),
                              index=genes,
                              columns=samplesheet_all.index)
    for ds in dss:
        counts_all.loc[:, ds['counts'].columns.values] = ds['counts'].values
    counts_all = CountsTable(counts_all)

    # Carry over the normalization flag of the last sub-tissue table
    normalized = dss[-1]['counts']._normalized
    if normalized:
        counts_all._normalized = normalized

    return Dataset(
        samplesheet=SampleSheet(samplesheet_all),
        counts_table=counts_all,
    )
Ejemplo n.º 5
0
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       15/08/17
content:    Test CountsTable class.
'''
# Script
if __name__ == '__main__':

    # NOTE: the config file location must be provided through an environment
    # variable when this script is invoked
    from singlet.counts_table import CountsTable
    full_table = CountsTable.from_tablename('example_table_tsv')

    print('Test binning of CountsTable')
    # Bin only the first 200 rows, in place, asking for bin indices
    subset = full_table.iloc[:200]
    subset.bin(result='index', inplace=True)
    largest_value = subset.values.max()
    assert largest_value == 4
    print('Done!')
Ejemplo n.º 6
0
def test_initialize():
    '''Check that a CountsTable can be built from 'example_table_tsv'.'''
    from singlet.counts_table import CountsTable

    # Passes as long as construction does not raise
    CountsTable.from_tablename('example_table_tsv')
Ejemplo n.º 7
0
def test_initialize_fromdataset():
    '''Check that a CountsTable can be built from 'example_dataset'.'''
    from singlet.counts_table import CountsTable

    # Passes as long as construction does not raise
    CountsTable.from_datasetname('example_dataset')