Beispiel #1
0
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    """Read ``filename`` into an annotated data object, optionally via a cache.

    Parameters
    ----------
    filename
        Path of the data file as a string; it is also used to derive the
        name of the cache file.
    backed
        Forwarded to ``read_h5ad`` when an hdf5 file is read without `sheet`.
    sheet
        Required for Excel files; for hdf5 files a non-``None`` value selects
        the ``read_hdf`` code path.
    ext
        Only validated against ``avail_exts``; the extension that is actually
        used is always derived from ``filename`` (see the note below).
    delimiter, first_column_names
        Forwarded to the text / csv readers.
    backup_url
        Forwarded to ``check_datafile_present_and_download``.
    cache
        If true, read from / write to an ``.h5ad`` file below
        ``settings.cachedir``.
    suppress_cache_warning
        If true, do not emit the "might be very slow" hint.

    Returns
    -------
    The object returned by the reader that matches the file extension.

    Raises
    ------
    ValueError
        If `ext` is not one of ``avail_exts``, if `sheet` is missing for an
        Excel file, or if the extension has no matching reader.
    FileNotFoundError
        If the file is absent and no usable cache file exists.
    """
    if ext is not None and ext not in avail_exts:
        # Format instead of concatenating: `str + avail_exts` raises a
        # TypeError whenever `avail_exts` is a collection rather than a string,
        # which would hide the intended ValueError.
        raise ValueError('Please provide one of the available extensions.\n'
                         '{}'.format(avail_exts))
    else:
        # NOTE(review): a valid, explicitly passed `ext` is overwritten by the
        # extension parsed from `filename` -- confirm whether that is intended.
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present:
        logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    # The cache name flattens the path into a single file name with an
    # '.h5ad' extension inside the cache directory.
    filename_cache = (settings.cachedir + filename.lstrip(
        './').replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speedup reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
Beispiel #2
0
 def setUp(self):
     """Create a plain AnnData fixture and an RDD-backed counterpart."""
     # regular anndata, read straight from the CSV input
     self.adata = ad.read_csv(input_file)
     # write as zarr, so we can read using a RDD
     zarr_store = tmp_dir()
     self.adata.write_zarr(zarr_store, chunks=(2, 5))
     self.adata_rdd = AnnDataRdd.from_zarr(self.sc, zarr_store)
Beispiel #3
0
def load_file(filepath):
    """Load a dataset from disk and tag it with its dataset name.

    The aliases ``'default'`` / ``'datasets/user_uploaded/default'`` and
    ``'test'`` are resolved to bundled files through ``join_root``. The reader
    is selected from the trailing characters of ``filepath``.

    Parameters
    ----------
    filepath : str
        Path of the file to load, or one of the aliases above.

    Returns
    -------
    The object returned by the matching ``anndata`` reader, with
    ``uns['dataset']`` set to the file's base name without extension.

    Raises
    ------
    IncorrectFileFormat
        If the reader fails for any reason, or if the path does not end in
        one of the supported suffixes.
    """
    if filepath in ('default', 'datasets/user_uploaded/default'):
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.splitext(os.path.basename(filepath))[0]

    # Stays None when no suffix matches, so that case can be reported as a
    # format error instead of failing later on an unbound variable.
    adata = None
    try:
        if filepath.endswith('h5ad'):
            adata = anndata.read_h5ad(filepath)
        elif filepath.endswith('csv'):
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        elif filepath.endswith('xlsx'):
            # NOTE(review): anndata.read_excel normally expects a sheet
            # argument -- confirm this call can succeed.
            adata = anndata.read_excel(filepath)
        elif filepath.endswith('mtx'):
            adata = anndata.read_mtx(filepath)
        elif filepath.endswith(('txt', 'tab', 'data')):
            adata = anndata.read_text(filepath)
        elif filepath.endswith('h5'):
            # NOTE(review): anndata.read_hdf normally expects a key
            # argument -- confirm this call can succeed.
            adata = anndata.read_hdf(filepath)
        elif filepath.endswith('loom'):
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        # Chain the cause so the underlying reader error stays visible.
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.") from e

    if adata is None:
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
 def from_csv(cls, sc, csv_file, chunk_size):
     """
     Build an instance from a CSV file.

     The file is read as an anndata object (for the metadata), while the
     data matrix (X) is provided as an RDD of numpy arrays through
     ``read_chunk_csv``.
     *Note* the anndata object currently also stores the data matrix, which is
     redundant and won't scale. This should be improved, possibly by changing anndata.
     """
     metadata = ad.read_csv(csv_file)
     chunk_reader = read_chunk_csv(csv_file, chunk_size)
     return cls._from_anndata(sc, metadata, chunk_size, chunk_reader)
Beispiel #5
0
def load_mouseAllen_10X_data(data_dir, ind=None, celltype_gran=0, curate=False):
    '''Load mouse Allen cortex data from 10X reads

    Reads the expression matrix and metadata from ``<data_dir>/AllenBrain``,
    attaches the metadata to ``adata.obs`` and keeps male samples only.

    @data_dir: directory that contains the ``AllenBrain`` folder
    @ind: individual sample -> 49 mice originally, 3 remained after filtering on male;
          several donors can be given joined by '_'
    @celltype_gran: major cell types (0) or sub-cell types (1)
    @curate: whether to curate to the same name as mouse FC, mouse pFC, mouse protocol
             (only applied for celltype_gran == 0)

    Returns the resulting AnnData object.
    '''
    allen_prefix = data_dir+os.sep+'AllenBrain/'
    adata = anndata.read_csv(allen_prefix+'filtered_matrix.csv')
    adata.var["gene_symboles"] = adata.var.index
    adata.var_names_make_unique(join="-")
    adata.obs['barcode'] = adata.obs.index
    adata.obs_names_make_unique(join="-")

    ## load metadata
    meta_df = pd.read_csv(allen_prefix+'filtered_metadata.csv', index_col=0)
    meta_df['external_donor_name_label'] = meta_df['external_donor_name_label'].astype(str)
    sample_info = pd.read_csv(allen_prefix+'sample_info.csv', index_col=0) ## use to filter gender
    merged_meta_df = meta_df.merge(sample_info, how='left', left_on="external_donor_name_label", right_on="external_donor_name")
    adata.obs = adata.obs.merge(merged_meta_df, left_index=True, right_on="sample_name", how='left')  ## merge anndata with metadata
    ## the merge does not keep the barcode index, so restore it
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata = adata[adata.obs['sex_label'] == 'M']  ## keep male samples only

    if 0 == celltype_gran:
        if curate:
            ## (original labels, curated label) pairs, applied in this order
            curated_groups = [
                (['L2 IT HATA', 'L2 IT RSP-ACA', 'L2/3 IT APr',
                  'L2/3 IT CTX', 'L2/3 IT PPP', 'L3 RSP-ACA', 'L4/5 IT CTX', 'L5 IT CTX',
                  'L5 NP CTX', 'L5 PT CTX', 'L5 PT RSP-ACA', 'L5/6 IT CTX', 'L6 CT CTX',
                  'L6 Car3', 'L6 IT CTX', 'L6 NP CT CTX', 'L6b CTX', 'NP PPP', 'NP SUB',
                  'CA2-IG-FC', 'L6b RHP', 'L6 IT RHP', 'L2/3 IT ENTl', 'L2 IT RSPv',
                  'L5 IT TPE-ENT', 'L2/3 IT TPE', 'L2 IT ProS'], 'Neuron'),  ## 27 types
                (['Sncg', 'Pvalb', 'Pvalb Vipr2', 'Sst',
                  'Sst Chodl', 'Vip', 'Lamp5', 'Lamp5 Lhx6', 'Pax6', 'Ndnf HPF'], 'Interneuron'),  ## 10 types
                (['Astro'], 'Astrocytes'),
                (['Endo'], 'Endothelial'),
                (['Micro'], 'Microglia'),
                (['OPC'], 'Polydendrocytes'),
                (['Oligo'], 'Oligodendrocytes'),
                (['Peri'], 'Pericytes'),
            ]
            ## Build the curated column and assign it back explicitly: an
            ## inplace replace on a column picked out of the frame is a chained
            ## assignment that pandas does not guarantee to propagate.
            cell_types = adata.obs['cell.type']
            for original_labels, curated_label in curated_groups:
                cell_types = cell_types.replace(original_labels, curated_label)
            adata.obs['cell.type'] = cell_types
            ## 'CR', 'PVM', 'SMC', 'VLMC' celltype are out of all other database, therefore not included
            remained_idx = ~adata.obs['cell.type'].isin(['CR', 'PVM', 'SMC', 'VLMC'])
            adata = adata[remained_idx]

    elif 1 == celltype_gran: ## for sub-celltypes
        adata.obs.rename(columns={"cell.type": "major_celltype"}, inplace=True)
        adata.obs.rename(columns={"cell_type_alias_label": "cell.type"}, inplace=True)

    if ind is not None:
        ind_cells = adata.obs[adata.obs["external_donor_name_label"].isin(ind.split('_'))].index
        adata = adata[ind_cells]

    return adata
Beispiel #6
0
def load_mouseAllen_SS_data(data_dir, ind=None, celltype_gran=0, curate=False):
    '''Load mouse Allen cortex data from Smart-seq exon reads

    Reads the expression matrix and metadata from ``<data_dir>/AllenBrain``
    and attaches a fixed subset of metadata columns to ``adata.obs``.

    @data_dir: directory that contains the ``AllenBrain`` folder
    @ind: individual sample -> 529 mice originally, 19 mice after filtering;
          several donors can be given joined by '_'
    @celltype_gran: major cell types (0) or sub-cell types (1)
    @curate: whether to curate to the same name as mouse FC, mouse pFC, mouse protocol
             (only applied for celltype_gran == 0)

    Returns the resulting AnnData object.
    '''
    allen_prefix = data_dir+os.sep+'AllenBrain/'
    adata = anndata.read_csv(allen_prefix+'filtered_SS_matrix.csv')
    adata.var["gene_symboles"] = adata.var.index
    adata.var_names_make_unique(join="-")
    adata.obs['barcode'] = adata.obs.index
    adata.obs_names_make_unique(join="-")

    ## load metadata
    meta_df = pd.read_csv(allen_prefix+'filtered_SS_metadata.csv', index_col=0)
    adata.obs = adata.obs.merge(meta_df, left_on='barcode', right_on='barcode', how='left')  ## merge anndata with metadata
    ## the merge does not keep the barcode index, so restore it
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.obs = adata.obs[['barcode', 'donor_label', 'region_label', 'joint_region_label', 'label', 'cluster_label', 'ss_cluster_label']]

    if 0 == celltype_gran:
        adata.obs.rename(columns={'label':'cell.type'}, inplace=True)
        if curate:
            ## (original labels, curated label) pairs, applied in this order
            ## NOTE(review): 'L5 NP CT CTX ' carries a trailing space -- confirm it matches the metadata
            curated_groups = [
                (['L2/3 IT CTX', 'L2/3 IT RSP', 'L2/3 IT RSPv',
                  'L2 IT ENTl', 'L4/5 IT CTX', 'L4 RSP-ACA', 'L5/6 IT CTX', 'L5 IT CTX',
                  'L5 IT TPE-ENT', 'L5 NP CT CTX ', 'L5 NP CTX', 'L5 PT CTX', 'L6b CTX',
                  'L6 CT CTX', 'L6 IT CTX', 'CA1', 'CA1-ProS', 'CA2-IG-FC', 'CA3',
                  'CT SUB', 'Car3', 'DG', 'L2 IT ENTm', 'L2 IT PAR', 'L2/3 IT APr',
                  'L2/3 IT ENTl', 'L2/3 IT HATA', 'L2/3 IT PPP', 'L3 IT ENTl', 'L3 IT ENTm',
                  'L5 PPP', 'L6 IT ENTl', 'L6b/CT ENT', 'NP PPP', 'NP SUB', 'SUB', 'SUB-ProS'], 'Neuron'),
                (['Lamp5', 'Lamp5 Lhx6', 'Pax6', 'Pvalb',
                  'Sncg', 'Sst', 'Sst Chodl', 'Vip'], 'Interneuron'),
                (['Astro'], 'Astrocytes'),
                (['Endo'], 'Endothelial'),
                (['Micro-PVM'], 'Microglia'),
                (['Oligo'], 'Oligodendrocytes'),
                (['SMC-Peri'], 'Pericytes'),
            ]
            ## Build the curated column and assign it back explicitly: an
            ## inplace replace on a column picked out of the frame is a chained
            ## assignment that pandas does not guarantee to propagate.
            cell_types = adata.obs['cell.type']
            for original_labels, curated_label in curated_groups:
                cell_types = cell_types.replace(original_labels, curated_label)
            adata.obs['cell.type'] = cell_types
            ## 'CR', 'Meis2', 'Meis2 HPF', 'Ndnf HPF', 'VLMC' celltype are out of all other database, therefore not included
            ## The polydendrocytes (OPCs) are missing
            remained_idx = ~adata.obs['cell.type'].isin(['CR', 'Meis2', 'Meis2 HPF', 'Ndnf HPF', 'VLMC'])
            adata = adata[remained_idx]

    elif 1 == celltype_gran: ## for sub-celltypes
        adata.obs.rename(columns={"cluster_label": "cell.type"}, inplace=True)

    if ind is not None:
        ind_cells = adata.obs[adata.obs["donor_label"].isin(ind.split('_'))].index
        adata = adata[ind_cells]

    return adata
Beispiel #7
0
def _load_csv(
    path_to_file: str,
    gene_by_cell: bool = False,
    delimiter: str = ",",
    first_column_names: bool = None,
):
    """Load a delimited text file into an AnnData object.

    Parameters
    ----------
    path_to_file
        Path of the file to read.
    gene_by_cell
        Set to true when the file is laid out as genes (rows) by cells
        (columns); the loaded object is then transposed.
    delimiter
        Field delimiter, forwarded to ``anndata.read_csv``.
    first_column_names
        Forwarded to ``anndata.read_csv``.

    Returns
    -------
    The loaded (and possibly transposed) AnnData object.
    """
    logger.info("Loading dataset from {}".format(path_to_file))
    adata = anndata.read_csv(path_to_file,
                             delimiter=delimiter,
                             first_column_names=first_column_names)
    if gene_by_cell:
        # Transpose the whole object rather than only `X`: assigning a
        # transposed matrix to `adata.X` changes its shape without swapping
        # the obs/var annotations.
        adata = adata.T
    logger.info("Finished loading dataset")
    return adata
Beispiel #8
0
def test_anndata():
    """Check that MAGIC keeps AnnData gene and cell labels intact."""
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    cells = anndata.read_csv("../data/test_data.csv")
    operator = magic.MAGIC(t='auto', a=None, k=10)

    # all genes: both axes must keep their labels
    imputed_all = operator.fit_transform(cells, genes="all_genes")
    assert np.all(imputed_all.var_names == cells.var_names)
    assert np.all(imputed_all.obs_names == cells.obs_names)

    # gene subset: only the requested genes remain, cells are unchanged
    imputed_subset = operator.fit_transform(cells, genes=['VIM', 'ZEB1'])
    expected_genes = np.array(['VIM', 'ZEB1'])
    assert np.all(imputed_subset.var_names.values == expected_genes)
    assert np.all(imputed_subset.obs_names == cells.obs_names)
Beispiel #9
0
def test_anndata():
    """Check that approximate-solver MAGIC preserves AnnData labels."""
    try:
        anndata
    except NameError:
        # anndata not installed
        return
    source = anndata.read_csv(data_path)
    magic_op = magic.MAGIC(
        t="auto",
        solver="approximate",
        decay=None,
        knn=10,
        verbose=False,
    )

    # all genes: both axes must keep their labels
    result = magic_op.fit_transform(source, genes="all_genes")
    assert np.all(result.var_names == source.var_names)
    assert np.all(result.obs_names == source.obs_names)

    # gene subset: only the requested genes remain, cells are unchanged
    result = magic_op.fit_transform(source, genes=["VIM", "ZEB1"])
    wanted = np.array(["VIM", "ZEB1"])
    assert np.all(result.var_names.values == wanted)
    assert np.all(result.obs_names == source.obs_names)
Beispiel #10
0
def load_file(filepath):
    """Load a dataset from disk, normalise its cluster names and tag it.

    The aliases ``'default'`` / ``'datasets/user_uploaded/default'`` and
    ``'test'`` are resolved to bundled files through ``join_root``. The reader
    is selected from the trailing characters of ``filepath``. Only the
    bundled default CSV is transposed after reading.

    Parameters
    ----------
    filepath : str
        Path of the file to load, or one of the aliases above.

    Returns
    -------
    The object returned by the matching ``anndata`` reader, with
    ``uns['dataset']`` set to the file's base name without extension and,
    if present, ``uns['cluster_names']`` converted to a ``bidict`` with
    integer keys.

    Raises
    ------
    IncorrectFileFormat
        If the reader fails for any reason, or if the path does not end in
        one of the supported suffixes.
    """
    transpose_csv = False
    if filepath in ('default', 'datasets/user_uploaded/default'):
        filepath = join_root("../datasets/default.csv")
        transpose_csv = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.splitext(os.path.basename(filepath))[0]

    # Stays None when no suffix matches, so that case can be reported as a
    # format error instead of failing later on an unbound variable.
    adata = None
    try:
        if filepath.endswith('h5ad'):
            adata = anndata.read_h5ad(filepath)
        elif filepath.endswith('csv'):
            adata = anndata.read_csv(filepath)
            if transpose_csv:
                adata = adata.T
        elif filepath.endswith('xlsx'):
            # NOTE(review): anndata.read_excel normally expects a sheet
            # argument -- confirm this call can succeed.
            adata = anndata.read_excel(filepath)
        elif filepath.endswith('mtx'):
            adata = anndata.read_mtx(filepath)
        elif filepath.endswith(('txt', 'tab', 'data')):
            adata = anndata.read_text(filepath)
        elif filepath.endswith('h5'):
            # NOTE(review): anndata.read_hdf normally expects a key
            # argument -- confirm this call can succeed.
            adata = anndata.read_hdf(filepath)
        elif filepath.endswith('loom'):
            adata = anndata.read_loom(filepath)
    except Exception as e:
        print(str(e))
        # Chain the cause so the underlying reader error stays visible.
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.") from e

    if adata is None:
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        # Iterate over a snapshot of the keys because entries are re-keyed
        # (converted to int) while looping.
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
Beispiel #11
0
def ReadData(feature, phen):
    """Read a tab-separated feature table and attach phenotype information.

    `feature` is read with ``ad.read_csv`` (first column used as names) and
    `phen` with ``pd.read_csv`` (first column as index, first row as header).
    The phenotype index is converted to strings, the rows are looked up by
    the observation names and stored in ``obs['phen']``.
    """
    features = ad.read_csv(feature, delimiter='\t', first_column_names=True)

    phen_table = pd.read_csv(phen, sep='\t', header=0, index_col=0)
    phen_table.index = list(map(str, phen_table.index))

    # NOTE(review): a DataFrame is assigned to a single column here; this
    # presumably relies on the phenotype file having one data column -- confirm.
    matched_rows = phen_table.loc[features.obs.index, :]
    features.obs['phen'] = matched_rows
    return features
 def read_one_chunk(chunk_index):
     """Re-read the CSV file and return the chunk selected by ``chunk_index``."""
     return read_adata_chunk(ad.read_csv(csv_file), chunk_size, chunk_index)
Beispiel #13
0
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    """Read ``filename`` with the reader matching its extension.

    hdf5 files are read directly. Every other format can be served from, and
    written to, an ``.h5ad`` cache file below ``settings.cachedir`` when
    `cache` is true. ``**kwargs`` are only forwarded to ``read_loom``.

    Raises ``ValueError`` for an `ext` outside ``avail_exts``, a missing
    `sheet` for Excel files, or an extension without a reader, and
    ``FileNotFoundError`` when the file is absent and no cache file is used.
    """
    if not (ext is None or ext in avail_exts):
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    # The extension that is used is always the one parsed from the file name.
    ext = is_valid_filename(filename, return_ext=True)

    file_found = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not file_found:
        logg.debug(f'... did not find original file {filename}')

    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is not None:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
        return read_h5ad(filename, backed=backed)

    # read other file types
    cache_name = _slugify(filename).replace('.' + ext, '.h5ad')
    cache_file = settings.cachedir / cache_name  # type: Path
    if cache_file.suffix in {'.gz', '.bz2'}:
        # drop a trailing compression suffix from the cache file name
        cache_file = cache_file.with_suffix('')
    if cache and cache_file.is_file():
        logg.info(f'... reading from cache file {cache_file}')
        return read_h5ad(cache_file)

    if not file_found:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not (cache or suppress_cache_warning):
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')

    # do the actual reading
    if ext in {'xlsx', 'xls'}:
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint(
                "... assuming '.data' means tab or white-space "
                'separated text file', )
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')

    if not cache:
        return adata

    # write for faster reading when calling the next time
    logg.info(f'... writing an {settings.file_format_data} '
              'cache file to speedup reading next time')
    if cache_compression is _empty:
        cache_compression = settings.cache_compression
    cache_dir = cache_file.parent
    if not cache_dir.is_dir():
        cache_dir.mkdir(parents=True)
    adata.write(cache_file, compression=cache_compression)
    return adata
Beispiel #14
0
def get_clusters_et_al(path,
                       size=5,
                       filter_ncounts=False,
                       filter_mito=False,
                       reload_file=False):
    """Load or compute a clustered dataset and draw summary plots.

    If `reload_file` is false and an ``.h5ad`` file with the same base name
    as `path` exists (or `path` itself is an ``.h5ad`` file), it is read.
    Otherwise the ``.csv`` file with that base name is read, transposed,
    filtered, normalised, reduced (PCA, neighbours, UMAP), clustered with
    Leiden and written to the ``.h5ad`` file.

    Parameters
    ----------
    path : str
        Path of the ``.csv`` or ``.h5ad`` file.
    size
        Point size forwarded to the scanpy plotting functions.
    filter_ncounts
        ``False`` to skip the filter, ``True`` to ask for the threshold
        interactively, or a number used directly as the threshold.
    filter_mito
        Same convention as `filter_ncounts`, applied to the fraction of
        mitochondrial expression.
    reload_file
        If true, ignore an existing ``.h5ad`` file and recompute from csv.

    Returns
    -------
    The processed AnnData object, or ``None`` when neither a ``.csv`` nor a
    ``.h5ad`` file can be used.
    """
    # Split once instead of using `str.replace(ext, ...)`, which rewrites
    # every occurrence of the extension and breaks when it is empty.
    base, ext = os.path.splitext(path)
    results_file = base + '.h5ad'
    csv_file = base + '.csv'
    if not reload_file and (ext == '.h5ad' or os.path.exists(results_file)):
        adata = ad.read_h5ad(results_file)
    elif ext == '.csv' or os.path.exists(csv_file):
        adata = ad.read_csv(csv_file).transpose()
        sc.pp.filter_genes(adata, min_cells=3)
        sc.pp.filter_cells(adata, min_genes=200)
        if filter_ncounts:
            # NOTE(review): the histograms and the threshold below use
            # `obs.n_genes` although the labels say "counts" and `n_counts`
            # is computed -- confirm which quantity is intended.
            if isinstance(filter_ncounts, bool):
                adata.obs['n_counts'] = adata.X.sum(axis=1)
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
                ax1.hist(adata.obs.n_genes,
                         bins=100,
                         range=(0, np.percentile(adata.obs.n_genes, 99)))
                ax1.set_xlabel('Number of counts')
                ax1.set_ylabel('Number of cells')
                ax2.hist(adata.obs.n_genes,
                         bins=100,
                         cumulative=True,
                         density=True,
                         range=(0, np.percentile(adata.obs.n_genes, 99)))
                ax2.set_xlabel('Number of counts')
                ax2.set_ylabel('Ratio of cells')
                ax2.grid(True, axis='both')
                fig.tight_layout()
                plt.show()
                th_ncount = input(
                    'Please enter the threshold value for the maximum number of counts: '
                )
                th_ncount = get_threshold(th_ncount, adata.obs.n_genes)
                while not is_number(th_ncount):
                    th_ncount = input('Please enter a numeric value: ')
                th_ncount = float(th_ncount)
            else:
                th_ncount = filter_ncounts
            fig, ax = plt.subplots(1, 1)
            filter_tab_ncounts = adata.obs.n_genes < th_ncount
            ax.hist([
                adata.obs.n_genes[filter_tab_ncounts],
                adata.obs.n_genes[~filter_tab_ncounts]
            ],
                    color=['k', 'r'],
                    label=['kept', 'removed'],
                    bins=100,
                    histtype='barstacked',
                    range=(0, np.percentile(adata.obs.n_genes, 99)))
            ax.set_xlabel('Number of counts')
            ax.set_ylabel('Number of cells')
            ax.legend()
            plt.show()
        else:
            filter_tab_ncounts = np.ones(adata.shape[0], dtype=bool)
        if filter_mito:
            # Computed for both the interactive and the numeric-threshold
            # case, because the filter below always reads `percent_mito`.
            mito_genes = (adata.var_names.str.startswith('mt-')
                          | adata.var_names.str.startswith('Mt-')
                          | adata.var_names.str.startswith('MT-'))
            adata.obs['percent_mito'] = np.sum(
                adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1)
            if isinstance(filter_mito, bool):
                fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 4))
                ax1.hist(adata.obs.percent_mito,
                         bins=100,
                         range=(0, np.percentile(adata.obs.percent_mito, 99)))
                ax1.set_xlabel('Percent of mito expression')
                ax1.set_ylabel('Number of cells')
                ax2.hist(adata.obs.percent_mito,
                         bins=100,
                         cumulative=True,
                         density=True,
                         range=(0, np.percentile(adata.obs.percent_mito, 99)))
                ax2.set_xlabel('Percent of mito expression')
                ax2.set_ylabel('Ratio of cells')
                ax2.grid(True, axis='both')
                fig.tight_layout()
                plt.show()
                th_mito = input(
                    'Please enter the threshold value for the maximum percent of mito expression: '
                )
                th_mito = get_threshold(th_mito, adata.obs.percent_mito)
                while not is_number(th_mito):
                    th_mito = input('Please enter a numeric value: ')
                th_mito = float(th_mito)
                # Close only the figure created in this branch; with a
                # numeric threshold there is no such figure.
                plt.close(fig)
            else:
                th_mito = filter_mito
            filter_tab_mito = adata.obs.percent_mito < th_mito
            fig, ax = plt.subplots(1, 1)
            ax.hist([
                adata.obs.percent_mito[filter_tab_mito],
                adata.obs.percent_mito[~filter_tab_mito]
            ],
                    color=['k', 'r'],
                    label=['kept', 'removed'],
                    bins=100,
                    histtype='barstacked',
                    range=(0, np.percentile(adata.obs.percent_mito, 99)))
            ax.set_xlabel('Number of counts')
            ax.set_ylabel('Number of cells')
            ax.legend()
            plt.show()
        else:
            filter_tab_mito = np.ones(adata.shape[0], dtype=bool)
        # Combine both filters and summarise in a pie chart how many cells
        # each of them removes.
        final_filt = np.ones(adata.shape[0], dtype=bool)
        if filter_ncounts:
            final_filt[~filter_tab_ncounts] = False
        if filter_mito:
            final_filt[~filter_tab_mito] = False
        both = np.sum((filter_tab_ncounts.astype(int) + filter_tab_mito) == 0)
        # -1: removed by the counts filter only, 1: by the mito filter only
        diff = filter_tab_ncounts.astype(int) - filter_tab_mito
        nc = np.sum(diff == -1)
        mito = np.sum(diff == 1)
        pie_values = [np.sum(final_filt), nc, both, mito]
        fig, ax = plt.subplots()
        ax.pie(pie_values,
               labels=['Kept', 'ncounts', 'both', 'mito'],
               shadow=False,
               startangle=90)
        ax.axis('equal')

        adata = adata[final_filt, :]
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata,
                                    min_mean=0.0125,
                                    max_mean=3,
                                    min_disp=0.5)
        adata.raw = adata
        adata = adata[:, adata.var.highly_variable]
        sc.pp.scale(adata, max_value=10)
        sc.tl.pca(adata, svd_solver='arpack')
        sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
        sc.tl.umap(adata)
        sc.tl.leiden(adata, .25)
        adata.write(results_file)
    else:
        print('Can only work with .csv or .h5ad files (you gave {})'.format(
            path))
        return
    # sc.pl.highly_variable_genes(adata.raw)
    sc.pl.pca(adata, color=['Bmp2', 'Sox9', 'Sox17'], size=size)
    sc.pl.pca_variance_ratio(adata, log=True)

    fig = sc.pl.umap(adata,
                     color=['Bmp2', 'Sox9', 'Wnt3'],
                     size=size,
                     show=False,
                     return_fig=True)
    fig.set_figwidth(20)
    fig.set_figheight(6)

    # Name used in the output files: base name of the results file plus the
    # Leiden resolution used above.
    c_name = os.path.splitext(os.path.basename(results_file))[0] + '.0.25'
    fig = sc.pl.umap(adata,
                     color=['leiden', 'Bmp2', 'Sox9'],
                     size=size,
                     show=False,
                     return_fig=True)
    fig.set_figwidth(20)
    fig.set_figheight(5)
    fig.savefig('figures/Umap.{:s}.pdf'.format(c_name))

    sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
    sc.pl.rank_genes_groups(adata,
                            n_genes=25,
                            sharey=False,
                            show=True,
                            return_fig=True,
                            save='.t-test.{:s}.pdf'.format(c_name))
    sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
    sc.pl.rank_genes_groups(adata,
                            n_genes=25,
                            sharey=False,
                            show=True,
                            return_fig=True,
                            save='.wilcoxon.{:s}.pdf'.format(c_name))
    return adata
def load_FC_adata(data_dir,
                  devstage=None,
                  ind=None,
                  treatment=None,
                  celltype_gran=0,
                  curate=False):
    '''Loading data from mouse frontal cortex along with metadata information

    Reads the expression matrix (transposed after reading) and the metadata
    from ``<data_dir>/MouseFC_GSE124952`` and joins the metadata onto
    ``adata.obs`` by index.

    @data_dir: directory that contains the ``MouseFC_GSE124952`` folder
    @devstage: P21 or Adult
    @ind: individual sample prediction: P21Sample1-3, PFCSample1-12;
          several samples can be given joined by '_'
    @treatment: Saline or Cocaine
    @celltype_gran: major cell types (0) or sub-cell types (1)
    @curate: whether to rename the major cell types for cross dataset
             prediction (only applied for celltype_gran == 0)

    Returns the resulting AnnData object.
    '''
    ## load data; set genes and cells
    adata = anndata.read_csv(
        data_dir + os.sep +
        "MouseFC_GSE124952/GSE124952_expression_matrix.csv").T

    adata.var["gene_symbols"] = adata.var.index
    adata.var_names_make_unique(join="-")

    adata.obs['barcode'] = adata.obs.index
    adata.obs_names_make_unique(join="-")

    ## load metadata
    meta_df = pd.read_csv(data_dir + os.sep +
                          "MouseFC_GSE124952/GSE124952_meta_data.csv",
                          index_col=0)
    adata.obs = adata.obs.merge(meta_df, left_index=True, right_index=True)

    if 0 == celltype_gran:
        adata.obs.rename(columns={"CellType": "cell.type"},
                         inplace=True)  ## change cell type names

        if curate:  ## if curation is needed for cross dataset prediction
            ## (original labels, curated label) pairs, applied in this order;
            ## 'NF Oligo' is grouped with 'OPC', not with 'Oligo'
            curated_groups = [
                (['Oligo'], 'Oligodendrocytes'),
                (['OPC', 'NF Oligo'], 'Polydendrocytes'),
                (['Astro'], 'Astrocytes'),
                (['Excitatory'], 'Neuron'),
                (['Inhibitory'], 'Interneuron'),
                (['Endo'], 'Endothelial'),
            ]
            ## Build the curated column and assign it back explicitly: an
            ## inplace replace on a column picked out of the frame is a chained
            ## assignment that pandas does not guarantee to propagate.
            cell_types = adata.obs["cell.type"]
            for original_labels, curated_label in curated_groups:
                cell_types = cell_types.replace(original_labels,
                                                curated_label)
            ## Microglia stays as microglia
            adata.obs["cell.type"] = cell_types

    elif 1 == celltype_gran:
        adata.obs.rename(columns={"L2_clusters": "cell.type"}, inplace=True)

    ## subset individuals
    if ind is not None:
        ind_cells = adata.obs[adata.obs['Sample'].isin(ind.split('_'))].index
        adata = adata[ind_cells]

    if devstage is not None:
        dev_cells = adata.obs[adata.obs["DevStage"] == devstage].index
        adata = adata[dev_cells]

    if treatment is not None:
        treat_cells = adata.obs[adata.obs["treatment"] == treatment].index
        adata = adata[treat_cells]

    return adata
Beispiel #16
0
def test_read_csv():
    """Reading ``adata.csv`` yields the expected row names, column names and values."""
    loaded = ad.read_csv(HERE / 'adata.csv')
    row_names = loaded.obs_names.tolist()
    assert row_names == ['r1', 'r2', 'r3']
    column_names = loaded.var_names.tolist()
    assert column_names == ['c1', 'c2']
    values = loaded.X.tolist()
    assert values == X_list
Beispiel #17
0
import sys
sys.path.append(".")  # Adds higher directory to python modules path.

import anndata
import pycogaps
import scipy.io
import scipy.sparse
import numpy as np
from PyCoGAPS import *

# Example script: run CoGAPS on the matrix stored in a CSV file.
# placeholder until we have anndata samples
# maybe also read files into an anndata object?
path = './data/GIST.csv'
# parameters object for the run, created from the same data path
prm = pycogaps.GapsParameters(path)

# read the CSV into an anndata object and take its data matrix
adata = anndata.read_csv(path)
adataX = adata.X

# densify the matrix if it is a scipy sparse matrix
if scipy.sparse.issparse(adataX):
    adataX = adataX.toarray()

# create Matrix object from anndata X
matrix = pycogaps.Matrix(adataX)

result = pycogaps.runCogapsFromMatrix(matrix, prm)

# convert Amean and Pmean results to numpy arrays
# NOTE(review): `toNumpy` is presumably provided by the wildcard import
# from PyCoGAPS -- confirm.
Amean = toNumpy(result.Amean)
Pmean = toNumpy(result.Pmean)

# anndata labels
Beispiel #18
0
def test_read_csv():
    """``ad.read_csv`` returns the labels and values stored in ``adata.csv``."""
    csv_path = HERE / "adata.csv"
    parsed = ad.read_csv(csv_path)

    expected_obs = ["r1", "r2", "r3"]
    expected_var = ["c1", "c2"]
    assert parsed.obs_names.tolist() == expected_obs
    assert parsed.var_names.tolist() == expected_var
    assert parsed.X.tolist() == X_list
Beispiel #19
0
def _load_input_file(path):
    """Load an input file into an AnnData object.

    Parameters
    ----------
    path : str
        Path of a ``.h5ad`` or ``.csv`` file.

    Returns
    -------
    The object returned by ``anndata.read_h5ad`` or ``anndata.read_csv``.

    Raises
    ------
    ValueError
        If `path` ends in neither ``.h5ad`` nor ``.csv``. Without this check
        the function would fail on an unbound local variable.
    """
    if path.endswith('.h5ad'):
        return anndata.read_h5ad(path)
    if path.endswith('.csv'):
        return anndata.read_csv(path)
    raise ValueError(
        'Unsupported input file {!r}: expected a .h5ad or .csv file.'.format(
            path))