def get_dataset(tissue, membrane_only=True):
    """Assemble a Dataset from the counts of a single tissue.

    Parameters:
        tissue: identifier forwarded to ``parse_counts``.
        membrane_only: when true, restrict the counts to the entries
            returned by ``parse_go_plasma_membrane`` that also appear in
            the counts index.

    Returns:
        A Dataset wrapping ``SampleSheet(cell_types)`` and a CountsTable
        built from the (possibly filtered) counts.
    """
    table = parse_counts(tissue)

    if membrane_only:
        membrane = parse_go_plasma_membrane()
        # Keep only the membrane entries that the counts table knows about
        present = membrane.isin(table.index)
        table = table.loc[membrane[present]]

    # NOTE(review): ``cell_types`` is not assigned anywhere in this function;
    # presumably it is a module-level name -- confirm against the full file.
    return Dataset(
        samplesheet=SampleSheet(cell_types),
        counts_table=CountsTable(table),
    )
def test_initialize():
    """Smoke test: build a SampleSheet from the example sheet name."""
    from singlet.samplesheet import SampleSheet

    sheetname = 'example_sheet_tsv'
    sheet = SampleSheet.from_sheetname(sheetname)
def test_initialize_fromdataset():
    """Smoke test: build a SampleSheet from the example dataset name."""
    from singlet.samplesheet import SampleSheet

    datasetname = 'example_dataset'
    sheet = SampleSheet.from_datasetname(datasetname)
def get_dataset(tissue, membrane_only=True, regenerate=False, go_contains=None, go_exclude=None):
    """Build a Dataset for a tissue, merging its FACS subtissues if needed.

    Some tissues (e.g. brain) were split for sorting; every entry of
    ``tissues_prediction[tissue]`` is parsed separately and the results are
    merged into a single Dataset.

    Parameters:
        tissue: key into ``tissues_prediction``.
        membrane_only: if true, keep only genes that are in the index of the
            table returned by ``parse_go_plasma_membrane``.
        regenerate: forwarded to ``parse_counts``.
        go_contains: if not None, keep only genes whose 'GONames' entry
            matches this pattern (via ``Series.str.contains``).
        go_exclude: if not None, drop genes whose 'GONames' entry matches
            this pattern. Mutually exclusive with ``go_contains``.

    Returns:
        A Dataset. With a single subtissue its counts are passed through as
        they are; with several, the counts are merged over the union of
        genes and wrapped in a CountsTable.

    Raises:
        ValueError: if both ``go_contains`` and ``go_exclude`` are given, or
            if the tissue has no subtissues.
    """
    # Validate the arguments before doing any expensive parsing
    if (go_contains is not None) and (go_exclude is not None):
        raise ValueError('Use either go_contains or go_exclude')

    # The GO table does not depend on the subtissue: parse it at most once
    use_go_names = (go_contains is not None) or (go_exclude is not None)
    go_table = None
    if membrane_only or use_go_names:
        go_table = parse_go_plasma_membrane()

    genes_go = None
    if go_contains is not None:
        genes_go = go_table.index[go_table['GONames'].str.contains(go_contains)]
    elif go_exclude is not None:
        genes_go = go_table.index[~go_table['GONames'].str.contains(go_exclude)]

    # Some tissues like brain were split for sorting, we merge them here
    dss = []
    for tissue_facs in tissues_prediction[tissue]:
        cell_types, _plates = parse_annotations(tissue_facs)
        counts = parse_counts(tissue_facs, regenerate=regenerate)

        if membrane_only:
            go_index = go_table.index
            genes_membrane = go_index[go_index.isin(counts.index)]
            counts = counts.loc[genes_membrane]

        if genes_go is not None:
            genes = np.intersect1d(genes_go, counts.index)
            counts = counts.loc[genes]

        dss.append({'samplesheet': cell_types, 'counts': counts})

    if not dss:
        raise ValueError(
            'No FACS subtissues found for tissue: {!r}'.format(tissue))

    if len(dss) == 1:
        single = dss[0]
        return Dataset(
            samplesheet=SampleSheet(single['samplesheet']),
            counts_table=single['counts'],
        )

    # Merging is kind of messy because some genes are absent from either
    # subtissue; they are filled with -1.0 for now as a placeholder until
    # the real numbers are plugged in.
    last_counts = dss[-1]['counts']

    genes = set()
    for ds in dss:
        genes |= set(ds['counts'].index.values)
    genes = pd.Index(sorted(genes), name=last_counts.index.name)

    for ds in dss:
        # The stuff is normalized, pseudocounted, and logged, hence -1.0
        # rather than zero for the missing genes. reindex adds all missing
        # rows at once and leaves the parsed table itself untouched.
        ds['counts'] = ds['counts'].reindex(genes, fill_value=-1.0)

    samplesheet_all = pd.concat([ds['samplesheet'] for ds in dss], axis=0)
    ngenes = len(genes)
    ncells = sum(ds['samplesheet'].shape[0] for ds in dss)
    counts_all = pd.DataFrame(
        np.zeros((ngenes, ncells), float),
        index=genes,
        columns=samplesheet_all.index,
    )
    for ds in dss:
        counts_all.loc[:, ds['counts'].columns.values] = ds['counts'].values

    counts_all = CountsTable(counts_all)
    # Carry over the normalization flag from the last subtissue's table
    normalized = dss[-1]['counts']._normalized
    if normalized:
        counts_all._normalized = normalized

    return Dataset(
        samplesheet=SampleSheet(samplesheet_all),
        counts_table=counts_all,
    )
#!/usr/bin/env python
# vim: fdm=indent
'''
author:     Fabio Zanini
date:       15/08/17
content:    Test SampleSheet class.
'''
# Script
if __name__ == '__main__':

    # NOTE: an env variable for the config file needs to be set when
    # calling this script
    from singlet.samplesheet import SampleSheet

    print('Instantiating SampleSheet')
    sheetname = 'example_sheet_tsv'
    sheet = SampleSheet.from_sheetname(sheetname)
    print('Done!')