Example #1
def load_file(filepath):
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        elif filepath[-3:] == 'csv':
            # TODO remove transpose
            adata = anndata.read_csv(filepath).T
        elif filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        elif filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        elif filepath[-3:] in ('txt', 'tab') or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        elif filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        elif filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
        else:
            # no recognized extension: fail here instead of leaving adata unbound
            raise ValueError('Unsupported file extension: ' + filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    adata.uns['dataset'] = dataset
    return adata
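
A quick usage sketch; join_root and IncorrectFileFormat are helpers assumed to be defined elsewhere in the host application, and the path is illustrative only:

# hypothetical call
adata = load_file('datasets/user_uploaded/pbmc.h5ad')
print(adata.uns['dataset'])  # -> 'pbmc'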
Example #2
def _read(filename, backed=False, sheet=None, ext=None, delimiter=None,
          first_column_names=None, backup_url=None, cache=False,
          suppress_cache_warning=False):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = check_datafile_present_and_download(filename,
                                                     backup_url=backup_url)
    if not is_present: logg.msg('... did not find original file', filename)
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.msg('reading sheet', sheet, 'from file', filename, v=4)
            return read_hdf(filename, sheet)
    # read other file types
    filename_cache = (settings.cachedir + filename.lstrip('./')
                      .replace('/', '-').replace('.' + ext, '.h5ad'))
    if cache and os.path.exists(filename_cache):
        logg.info('... reading from cache file', filename_cache)
        adata = read_h5ad(filename_cache, backed=False)
    else:
        if not is_present:
            raise FileNotFoundError('Did not find file {}.'.format(filename))
        logg.msg('reading', filename, v=4)
        if not cache and not suppress_cache_warning:
            logg.hint('This might be very slow. Consider passing `cache=True`, '
                      'which enables much faster reading from a cache file.')
        # do the actual reading
        if ext == 'xlsx' or ext == 'xls':
            if sheet is None:
                raise ValueError(
                    'Provide `sheet` parameter when reading \'.xlsx\' files.')
            else:
                adata = read_excel(filename, sheet)
        elif ext == 'mtx':
            adata = read_mtx(filename)
        elif ext == 'csv':
            adata = read_csv(filename, first_column_names=first_column_names)
        elif ext in {'txt', 'tab', 'data', 'tsv'}:
            if ext == 'data':
                logg.msg('... assuming \'.data\' means tab or white-space '
                         'separated text file', v=3)
                logg.hint('change this by passing `ext` to sc.read')
            adata = read_text(filename, delimiter, first_column_names)
        elif ext == 'soft.gz':
            adata = _read_softgz(filename)
        else:
            raise ValueError('Unknown extension {}.'.format(ext))
        if cache:
            logg.info('... writing an', settings.file_format_data,
                      'cache file to speed up reading next time')
            if not os.path.exists(os.path.dirname(filename_cache)):
                os.makedirs(os.path.dirname(filename_cache))
            # write for faster reading when calling the next time
            adata.write(filename_cache)
    return adata
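
The cache path above is built by flattening the input path into a single file name. A minimal check of that transform, assuming settings.cachedir is a plain string prefix (note that lstrip('./') strips any run of leading '.' and '/' characters, not the literal prefix './'):

class settings:  # stand-in for the real settings object
    cachedir = './cache/'

filename, ext = './data/pbmc/matrix.csv', 'csv'
filename_cache = (settings.cachedir + filename.lstrip('./')
                  .replace('/', '-').replace('.' + ext, '.h5ad'))
print(filename_cache)  # ./cache/data-pbmc-matrix.h5ad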
Example #3
    def load_data_mtx(self,
                      mtx_file,
                      mtx_obs=None,
                      mtx_feature=None,
                      meta_data_file=None,
                      meta_data_handler=DEFAULT_METADATA,
                      gene_data_file=None,
                      gene_name_column=None):

        data = anndata.read_mtx(self.input_path(mtx_file))
        row_names = self._load_list_from_file(
            self.input_path(mtx_obs)) if mtx_obs is not None else None
        col_names = self._load_list_from_file(
            self.input_path(mtx_feature)) if mtx_feature is not None else None

        meta_data = self.load_metadata_tsv(meta_data_file,
                                           data.obs_names,
                                           meta_data_handler=meta_data_handler)
        gene_metadata = self.load_gene_metadata_tsv(gene_data_file,
                                                    gene_name_column)

        data = InferelatorData(data,
                               meta_data=meta_data,
                               gene_data=gene_metadata,
                               sample_names=row_names,
                               gene_names=col_names)

        return data
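
A hypothetical call; the file names are illustrative, loader stands for an instance of the (unshown) host class, and input_path() is assumed to resolve names against the loader's input directory:

data = loader.load_data_mtx('expression.mtx',
                            mtx_obs='samples.txt',
                            mtx_feature='genes.txt',
                            meta_data_file='meta_data.tsv')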
Example #4
def read_10X(data_path, var_names='gene_symbols'):

    adata = read_mtx(data_path + '/matrix.mtx').T

    genes = pd.read_csv(data_path + '/genes.tsv', header=None, sep='\t')
    adata.var['gene_ids'] = genes[0].values
    adata.var['gene_symbols'] = genes[1].values

    assert var_names == 'gene_symbols' or var_names == 'gene_ids', \
        'var_names must be "gene_symbols" or "gene_ids"'

    if var_names == 'gene_symbols':
        var_names = genes[1]
    else:
        var_names = genes[0]

    if not var_names.is_unique:
        var_names = make_index_unique(pd.Index(var_names))
        print('var_names were not unique; "make_index_unique" has been applied')

    adata.var_names = var_names

    cells = pd.read_csv(data_path + '/barcodes.tsv', header=None, sep='\t')
    adata.obs['barcode'] = cells[0].values
    adata.obs_names = cells[0]
    return adata
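
read_10X expects the three-file CellRanger v2-style layout (matrix.mtx, genes.tsv, barcodes.tsv) in one directory. A hypothetical call; the directory name is illustrative:

adata = read_10X('filtered_gene_bc_matrices/hg19', var_names='gene_ids')
print(adata.var_names[:5])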
Example #5
def read_dataset(path, obs=None, var=None, obs_filter=None, var_filter=None, **keywords):
    """
    Read h5ad, loom, mtx, zarr, and txt formatted files

    Parameters
    ----------
    path: str
        File name of data file.
    obs: {str, pd.DataFrame}
        Path to obs data file or a data frame
    var: {str, pd.DataFrame}
        Path to var data file or a data frame
    obs_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    var_filter: {str, pd.DataFrame}
        File with one id per line, name of a boolean field in obs, or a list of ids
    Returns
    -------
    Annotated data matrix.
    """

    _, ext = os.path.splitext(str(path).lower())
    if ext == '.txt':
        df = pd.read_csv(path, engine='python', header=0, sep=None, index_col=0)
        adata = anndata.AnnData(X=df.values, obs=pd.DataFrame(index=df.index), var=pd.DataFrame(index=df.columns))
    elif ext == '.h5ad':
        adata = anndata.read(path)
    elif ext == '.loom':
        adata = anndata.read_loom(path)
    elif ext == '.mtx':
        adata = anndata.read_mtx(path)
    elif ext == '.zarr':
        adata = anndata.read_zarr(path)
    else:
        raise ValueError('Unknown file format: {}'.format(ext))

    def get_df(meta):
        if not isinstance(meta, pd.DataFrame):
            tmp_path = None
            if meta.startswith('gs://'):
                tmp_path = download_gs_url(meta)
                meta = tmp_path
            meta = pd.read_csv(meta, sep=None, index_col='id', engine='python')
            if tmp_path is not None:
                os.remove(tmp_path)
        return meta

    if obs is not None:
        if not isinstance(obs, list) and not isinstance(obs, tuple):
            obs = [obs]
        for item in obs:
            adata.obs = adata.obs.join(get_df(item))
    if var is not None:
        if not isinstance(var, list) and not isinstance(var, tuple):
            var = [var]
        for item in var:
            adata.var = adata.var.join(get_df(item))

    return filter_adata(adata, obs_filter=obs_filter, var_filter=var_filter)
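
A hypothetical call joining per-cell annotations onto the loaded matrix; the file names are illustrative, and the CSV is assumed to carry an 'id' column matching the obs names:

adata = read_dataset('counts.loom',
                     obs='cell_annotations.csv',
                     obs_filter='barcodes_to_keep.txt')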
Example #6
def main():
    """Run the script"""
    parser = build_parser()
    args = parser.parse_args()

    cell_df = pd.read_csv(
        args.cell_info,
        delimiter=","
        if utils.get_file_extension_no_gz(args.cell_info) == "csv" else "\t",
        index_col=args.cellindexcol,
        header=None if args.noheader else "infer",  # 'infer' is default
    )
    if "Barcodes" in cell_df.columns and args.cellindexcol is not None:
        cell_df.index = cell_df["Barcodes"]
    cell_df.index = cell_df.index.rename("barcode")
    cell_df.columns = cell_df.columns.map(str)

    logging.info(f"Read cell metadata from {args.cell_info} {cell_df.shape}")
    logging.info(f"Cell metadata cols: {cell_df.columns}")
    logging.info(cell_df)

    var_df = pd.read_csv(
        args.var_info,
        delimiter=","
        if utils.get_file_extension_no_gz(args.var_info) == "csv" else "\t",
        index_col=args.varindexcol,
        header=None if args.noheader else "infer",  # 'infer' is default
    )
    if "Feature" in var_df.columns and args.varindexcol is not None:
        var_df.index = [ensure_sane_interval(s) for s in var_df["Feature"]]
    var_df.index = var_df.index.rename("ft")
    var_df.columns = var_df.columns.map(str)
    # var_df.index = var_df.index.map(str)
    logging.info(f"Read variable metadata from {args.var_info} {var_df.shape}")
    logging.info(f"Var metadata cols: {var_df.columns}")
    logging.info(var_df)

    # Transpose because bio considers rows to be features
    adata = ad.read_mtx(args.mat_file).T
    logging.info(f"Read matrix {args.mat_file} {adata.shape}")
    adata.obs = cell_df
    adata.var = var_df
    logging.info(f"Created AnnData object: {adata}")
    logging.info(f"Obs names: {adata.obs_names}")
    logging.info(f"Var names: {adata.var_names}")

    if args.reindexvar:
        assert args.varindexcol is not None, "Must provide var index col to reindex var"
        target_vars = utils.read_delimited_file(args.reindexvar)
        logging.info(
            f"Read {args.reindexvar} for {len(target_vars)} vars to reindex")
        adata = adata_utils.reindex_adata_vars(adata, target_vars)

    adata.X = csr_matrix(adata.X)
    logging.info(f"Writing to {args.out_h5ad}")
    adata.write_h5ad(args.out_h5ad, compression=None)
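
The delimiter choice above hinges on utils.get_file_extension_no_gz. A minimal sketch of the behavior that helper is assumed to have (the extension after peeling a trailing .gz):

def get_file_extension_no_gz(path: str) -> str:
    """Assumed behavior: 'cells.csv.gz' -> 'csv', 'cells.tsv' -> 'tsv'."""
    if path.endswith('.gz'):
        path = path[:-3]
    return path.rsplit('.', 1)[-1]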
Example #7
def pretrainFolder(folder,
                   species_list,
                   data_type_list=None,
                   out_dir=".",
                   initial_file="",
                   n_mouse=21122,
                   n_human=21183,
                   n_shared=15494,
                   batch_size=100,
                   pretrain_kwargs={}):

    mtx_files = [
        y for x in os.walk(folder) for y in glob(os.path.join(x[0], '*.mtx'))
    ]
    nonmissing_files = [
        y for x in os.walk(folder)
        for y in glob(os.path.join(x[0], '*nonmissing.txt'))
    ]
    if data_type_list is None:
        data_type_list = ['UMI'] * len(mtx_files)
    if len(species_list) == 1:
        species_list = species_list * len(mtx_files)

    idx = np.arange(len(mtx_files))
    np.random.seed(42)
    np.random.shuffle(idx)

    nonmissing_indicator_list = []

    for f in nonmissing_files:
        nonmissing_indicator_list.append(np.loadtxt(f))

    data_list = []
    for ff in mtx_files:
        print(ff)
        data_list.append(anndata.read_mtx(ff).transpose())
    print(species_list)
    print(data_type_list)

    for i in range(len(mtx_files)):
        data_list[i].uns['species'] = species_list[i]
        print(species_list[i])
        data_list[i].uns['data_type'] = data_type_list[i]
        print(data_type_list[i])

    result = SaverXTrain(data_list,
                         n_human,
                         n_mouse,
                         n_shared,
                         out_dir=out_dir,
                         nonmissing_indicator_list=nonmissing_indicator_list,
                         initial_file=initial_file,
                         batch_size=batch_size,
                         **pretrain_kwargs)
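
The os.walk + glob comprehension above collects every .mtx under folder, including the top level. On Python 3.5+ the same file list (possibly in a different order) can be had with a recursive glob:

from glob import glob
import os

folder = 'data'  # illustrative; same role as the folder argument above
mtx_files = glob(os.path.join(folder, '**', '*.mtx'), recursive=True)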
Example #8
def combine_celltypes(data_dir):
    '''Merge the downloaded FACS-sorted datasets into one AnnData object
    '''
    celltypes = [
        "b_cells",
        "cd14_monocytes",
        "cd34",
        "cd56_nk",
        "cd4_t_helper",
        "naive_t",
        "memory_t",
        "regulatory_t",  ## CD4+ T cells
        "cytotoxic_t",
        "naive_cytotoxic"
    ]  ## CD8+ T cells
    adata_list = []
    for celltype in celltypes:
        celltype_dir = data_dir + os.sep + 'PBMC_Zheng_FACS' + os.sep + celltype
        adata = anndata.read_mtx(celltype_dir + os.sep + 'matrix.mtx').T
        ## load genes
        genes = pd.read_csv(celltype_dir + os.sep + 'genes.tsv',
                            header=None,
                            sep='\t')
        adata.var['gene_symbols'] = genes[1].values
        adata.var_names = adata.var['gene_symbols']
        adata.var_names_make_unique(join="-")
        ## load cells
        cells = pd.read_csv(celltype_dir + os.sep + 'barcodes.tsv',
                            header=None,
                            sep='\t')
        adata.obs['barcode'] = cells[0].values
        adata.obs_names = cells[0]
        adata.obs_names_make_unique(join="-")
        ## append adata
        adata_list.append(adata)

    final_adata = anndata.AnnData.concatenate(
        *adata_list,
        join='inner',
        batch_key="cell.type",
        batch_categories=celltypes)  #inner
    final_adata.var.index.name = None
    final_adata.obs.index.name = None
    final_adata.write(data_dir + os.sep + 'PBMC_Zheng_FACS/FACS_adata.h5ad')
    return final_adata
Example #9
def write_brain_adata(data_dir, region="FC"):
    '''Load data from a given brain region, build an AnnData object, and store it
    @region: FC/HC

    FC: 29463 genes * 194027 cells -> 71,445 cells
    HC: 27953 genes * 134430 cells -> 53,204 cells

    Note: the mtx and genes/barcodes files come from the RDS file via writeMM and write.
    Because the file is too large, I generated the h5ad and deleted the original data:

    data <- readRDS("Hippocampus.RDS")
    writeMM(data, "mouse_HC.mtx")
    write(rownames(data), "mouse_HC_genes.tsv")
    write(colnames(data), "mouse_HC_barcodes.tsv")
    '''
    ## load data as anndata
    adata = anndata.read_mtx(data_dir+os.sep+'Mousebrain/mouse_'+region+'.mtx').T

    ## load cells and genes
    genes = pd.read_csv(data_dir+os.sep+'Mousebrain/mouse_'+region+'_genes.tsv',
            header=None, sep='\t')
    adata.var['gene_symbols'] = genes[0].values
    adata.var_names = adata.var['gene_symbols']
    adata.var_names_make_unique(join="-")

    cells = pd.read_csv(data_dir+os.sep+'Mousebrain/mouse_'+region+'_barcodes.tsv',
            header=None, sep='\t')
    adata.obs['barcode'] = cells[0].values
    adata.obs_names = cells[0]
    adata.obs_names_make_unique(join="-")

    ## load metadata information
    df = pd.read_csv(data_dir+os.sep+"Mousebrain/Mousebrain_metadata.csv", index_col=0)
    df = df[df["mouse_celltypes"] != "unknown"] # remove unknown cell types
    common_barcodes = set(df["barcodes"]).intersection(set(adata.obs["barcode"])) # 53,204 cells
    adata = adata[list(common_barcodes)]

    adata.obs = adata.obs.merge(df, left_on="barcode", right_on="barcodes")
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.var.index.name=None

    adata.write(data_dir+os.sep+'Mousebrain/'+region+'_adata.h5ad')
Example #10
def load_file(filepath):
    t_flag = False
    if filepath == 'default' or filepath == 'datasets/user_uploaded/default':
        filepath = join_root("../datasets/default.csv")
        t_flag = True
    elif filepath == 'test':
        filepath = join_root('../../datasets/server/testdataset.h5ad')

    dataset = os.path.basename(filepath)
    dataset = os.path.splitext(dataset)[0]

    try:
        if filepath[-4:] == 'h5ad':
            adata = anndata.read_h5ad(filepath)
        elif filepath[-3:] == 'csv':
            adata = anndata.read_csv(filepath)
            if t_flag:
                adata = adata.T
        elif filepath[-4:] == 'xlsx':
            adata = anndata.read_excel(filepath)
        elif filepath[-3:] == 'mtx':
            adata = anndata.read_mtx(filepath)
        elif filepath[-3:] in ('txt', 'tab') or filepath[-4:] == 'data':
            adata = anndata.read_text(filepath)
        elif filepath[-2:] == 'h5':
            adata = anndata.read_hdf(filepath)
        elif filepath[-4:] == 'loom':
            adata = anndata.read_loom(filepath)
        else:
            # no recognized extension: fail here instead of leaving adata unbound
            raise ValueError('Unsupported file extension: ' + filepath)
    except Exception as e:
        print(str(e))
        raise IncorrectFileFormat(
            "File does not exist or file format is incorrect.")

    # Make sure cluster names are in proper format
    if 'cluster_names' in adata.uns:
        adata.uns['cluster_names'] = bidict(adata.uns['cluster_names'])
        for key in list(adata.uns['cluster_names'].keys()):
            adata.uns['cluster_names'][int(key)] = \
                adata.uns['cluster_names'].pop(key, None)

    adata.uns['dataset'] = dataset
    return adata
Example #11
def load_PBMC_liger_data(data_dir):
    '''Loading PBMC liger 6k cells data
    '''
    ## load data as anndata
    adata = anndata.read_mtx(data_dir + os.sep + 'PBMC_demuxlet/matrix.mtx').T

    ## load cells and genes
    genes = pd.read_csv(data_dir + os.sep + 'PBMC_demuxlet/genes.tsv',
                        header=None,
                        sep='\t')
    adata.var['gene_symbols'] = genes[0].values
    adata.var_names = adata.var['gene_symbols']
    # Make sure the gene names are unique
    adata.var_names_make_unique(join="-")

    cells = pd.read_csv(data_dir + os.sep + 'PBMC_demuxlet/barcodes.tsv',
                        header=None,
                        sep='\t')
    adata.obs['barcode'] = cells[0].values
    adata.obs_names = cells[0]
    # Make sure the cell names are unique
    adata.obs_names_make_unique(join="-")
    return adata
Example #12
### SPLIT SINGLE-CELL DATASET IN GENERATION AND VALIDATION SET ###

import pickle
import random

import anndata
import numpy as np
import pandas as pd

adata_raw = anndata.read_mtx(
    '/nfs/team205/vk7/sanger_projects/large_data/mouse_viseum_snrna/rawdata/all.mtx'
).T

adata_snrna = anndata.read_h5ad(
    "/nfs/team283/ed6/processed_data/visium_st_beta/snRNA_s144600_preprocessed_20200109.h5ad"
)

## Cell type annotations
labels = pd.read_csv(
    '/nfs/team283/ed6/processed_data/visium_st_beta/snRNA_annotation_20200229.csv',
    index_col=0)

# ## Add cell type labels as columns in adata.obs
# adata_snrna = adata_snrna_raw[labels.index,]
# adata_snrna.obs = pd.concat([labels, adata_snrna_raw.obs], axis=1)

# add cell names
obs_id = pd.read_csv(
    '/nfs/team205/vk7/sanger_projects/large_data/mouse_viseum_snrna/rawdata/all_cells.txt'
)
obs = obs_id['cell_id'].str.split(pat="_", expand=True)
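
str.split(..., expand=True) turns the underscore-delimited cell ids into a two-column frame. A small check of that shape, with made-up barcodes:

import pandas as pd

obs_id = pd.DataFrame({'cell_id': ['sampleA_AAACCTGAG', 'sampleB_AAACGGGTC']})
print(obs_id['cell_id'].str.split(pat='_', expand=True))
#          0          1
# 0  sampleA  AAACCTGAG
# 1  sampleB  AAACGGGTC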
Example #13
def _read(
    filename: Path,
    backed=None,
    sheet=None,
    ext=None,
    delimiter=None,
    first_column_names=None,
    backup_url=None,
    cache=False,
    cache_compression=None,
    suppress_cache_warning=False,
    **kwargs,
):
    if ext is not None and ext not in avail_exts:
        raise ValueError('Please provide one of the available extensions.\n'
                         f'{avail_exts}')
    else:
        ext = is_valid_filename(filename, return_ext=True)
    is_present = _check_datafile_present_and_download(
        filename,
        backup_url=backup_url,
    )
    if not is_present:
        logg.debug(f'... did not find original file {filename}')
    # read hdf5 files
    if ext in {'h5', 'h5ad'}:
        if sheet is None:
            return read_h5ad(filename, backed=backed)
        else:
            logg.debug(f'reading sheet {sheet} from file {filename}')
            return read_hdf(filename, sheet)
    # read other file types
    path_cache = settings.cachedir / _slugify(filename).replace(
        '.' + ext, '.h5ad')  # type: Path
    if path_cache.suffix in {'.gz', '.bz2'}:
        path_cache = path_cache.with_suffix('')
    if cache and path_cache.is_file():
        logg.info(f'... reading from cache file {path_cache}')
        return read_h5ad(path_cache)

    if not is_present:
        raise FileNotFoundError(f'Did not find file {filename}.')
    logg.debug(f'reading {filename}')
    if not cache and not suppress_cache_warning:
        logg.hint('This might be very slow. Consider passing `cache=True`, '
                  'which enables much faster reading from a cache file.')
    # do the actual reading
    if ext == 'xlsx' or ext == 'xls':
        if sheet is None:
            raise ValueError(
                "Provide `sheet` parameter when reading '.xlsx' files.")
        else:
            adata = read_excel(filename, sheet)
    elif ext in {'mtx', 'mtx.gz'}:
        adata = read_mtx(filename)
    elif ext == 'csv':
        adata = read_csv(filename, first_column_names=first_column_names)
    elif ext in {'txt', 'tab', 'data', 'tsv'}:
        if ext == 'data':
            logg.hint("... assuming '.data' means tab or white-space "
                      'separated text file')
            logg.hint('change this by passing `ext` to sc.read')
        adata = read_text(filename, delimiter, first_column_names)
    elif ext == 'soft.gz':
        adata = _read_softgz(filename)
    elif ext == 'loom':
        adata = read_loom(filename=filename, **kwargs)
    else:
        raise ValueError(f'Unknown extension {ext}.')
    if cache:
        logg.info(f'... writing an {settings.file_format_data} '
                  'cache file to speed up reading next time')
        if cache_compression is _empty:
            cache_compression = settings.cache_compression
        if not path_cache.parent.is_dir():
            path_cache.parent.mkdir(parents=True)
        # write for faster reading when calling the next time
        adata.write(path_cache, compression=cache_compression)
    return adata
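
Both _read variants defer extension detection to is_valid_filename(..., return_ext=True). A minimal sketch of what that helper is assumed to do, matching the longest known suffix first so that 'x.mtx.gz' resolves to 'mtx.gz' (the avail_exts contents shown are an assumption):

from pathlib import Path

avail_exts = {'csv', 'tsv', 'txt', 'tab', 'data', 'h5', 'h5ad', 'loom',
              'mtx', 'mtx.gz', 'xlsx', 'xls', 'soft.gz'}  # assumed contents

def is_valid_filename(filename, return_ext=False):
    name = Path(filename).name
    for ext in sorted(avail_exts, key=len, reverse=True):
        if name.endswith('.' + ext):
            return ext if return_ext else True
    raise ValueError(f'{name} does not end in a recognized extension.')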
Example #14
def load_PBMC_batch2_data(data_dir, condition=None, ind=None):
    '''Loading PBMC batch2 data

    @condition: control/stimulated
    @ind: 101 1015 1016 1039 107 1244 1256 1488
    '''
    ## load genes
    genes = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSE96583_batch2.genes.tsv',
                        header=None,
                        sep='\t')

    ## load control data
    ctrl_adata = anndata.read_mtx(data_dir + os.sep +
                                  'PBMC_demuxlet/GSM2560248_2.1.mtx').T
    ctrl_adata.var['gene_symbols'] = genes[1].values
    ctrl_adata.var_names = ctrl_adata.var['gene_symbols']
    # Make sure the gene names are unique
    ctrl_adata.var_names_make_unique(join="-")

    cells = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSM2560248_barcodes.tsv',
                        header=None,
                        sep='\t')
    ctrl_adata.obs['barcode'] = 'ctrl' + cells[0].values
    ctrl_adata.obs_names = cells[0]
    # Make sure the cell names are unique
    ctrl_adata.obs_names_make_unique(join="-")

    ## load stim data
    stim_adata = anndata.read_mtx(data_dir + os.sep +
                                  'PBMC_demuxlet/GSM2560249_2.2.mtx').T
    stim_adata.var['gene_symbols'] = genes[1].values
    stim_adata.var_names = stim_adata.var['gene_symbols']
    # Make sure the gene names are unique
    stim_adata.var_names_make_unique(join="-")

    cells = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSM2560249_barcodes.tsv',
                        header=None,
                        sep='\t')
    stim_adata.obs['barcode'] = 'stim' + cells[0].values
    stim_adata.obs_names = cells[0]
    # Make sure the cell names are unique
    stim_adata.obs_names_make_unique(join="-")

    ## combine control/stimulated data together
    adata = ctrl_adata.concatenate(stim_adata,
                                   batch_key="condition",
                                   batch_categories=['control', 'stimulated'])
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None

    ## load meta data information
    PBMC_batch2_df = load_PBMC_batch2_df(data_dir)
    common_barcodes = set(PBMC_batch2_df['barcode']).intersection(
        set(adata.obs['barcode']))
    adata = adata[list(common_barcodes)]

    adata.obs = adata.obs.merge(PBMC_batch2_df,
                                left_on="barcode",
                                right_on="barcode")
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.var.index.name = None

    adata.obs.rename(columns={'cell': 'cell.type'}, inplace=True)

    if condition is not None:
        cond_cells = adata.obs[adata.obs["condition"] == condition].index
        adata = adata[cond_cells]

    if ind is not None:
        ind_cells = adata.obs[adata.obs["ind"].isin(ind.split('_'))].index
        adata = adata[ind_cells]
    return adata
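
A hypothetical call; the data directory is illustrative. Note that condition is matched against the concatenation categories 'control'/'stimulated' set above:

adata = load_PBMC_batch2_data('data', condition='control', ind='101_1015')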
Example #15
def load_PBMC_batch1_data(data_dir, batch=None, ind=None):
    '''Loading PBMC batch1 S1 dataset from W1-3

    @batch: A/B/C
    @ind: 1043 1079 1154 1249 1493 1511 1598 1085
    '''
    ## load batch1 genes
    genes = pd.read_csv(data_dir + os.sep +
                        'PBMC_demuxlet/GSE96583_batch1.genes.tsv',
                        header=None,
                        sep='\t')

    ## load matrix A data
    A_adata = anndata.read_mtx(
        data_dir + os.sep +
        'PBMC_demuxlet/GSM2560245_A.mat').T  # 3639 individuals
    ## load cells
    A_adata.var['gene_symbols'] = genes[1].values
    A_adata.var_names = A_adata.var['gene_symbols']
    A_adata.var_names_make_unique(join="-")  # make unique

    A_cells = pd.read_csv(data_dir + os.sep +
                          'PBMC_demuxlet/GSM2560245_barcodes.tsv',
                          header=None,
                          sep='\t')
    A_adata.obs['barcode'] = 'A_' + A_cells[0].values
    A_adata.obs_names = A_cells[0]
    A_adata.obs_names_make_unique(join="-")  # make unique

    ## load matrix B data
    B_adata = anndata.read_mtx(
        data_dir + os.sep +
        'PBMC_demuxlet/GSM2560246_B.mat').T  # 4246 individuals
    ## load cells
    B_adata.var['gene_symbols'] = genes[1].values
    B_adata.var_names = B_adata.var['gene_symbols']
    B_adata.var_names_make_unique(join="-")  # make unique

    B_cells = pd.read_csv(data_dir + os.sep +
                          'PBMC_demuxlet/GSM2560246_barcodes.tsv',
                          header=None,
                          sep='\t')
    B_adata.obs['barcode'] = 'B_' + B_cells[0].values
    B_adata.obs_names = B_cells[0]
    B_adata.obs_names_make_unique(join="-")  # make unique

    ## load matrix C data
    C_adata = anndata.read_mtx(
        data_dir + os.sep +
        'PBMC_demuxlet/GSM2560247_C.mat').T  # 6145 individuals
    ## load cells
    C_adata.var['gene_symbols'] = genes[1].values
    C_adata.var_names = C_adata.var['gene_symbols']
    C_adata.var_names_make_unique(join="-")  # make unique

    C_cells = pd.read_csv(data_dir + os.sep +
                          'PBMC_demuxlet/GSM2560247_barcodes.tsv',
                          header=None,
                          sep='\t')
    C_adata.obs['barcode'] = 'C_' + C_cells[0].values
    C_adata.obs_names = C_cells[0]
    C_adata.obs_names_make_unique(join="-")  # make unique

    # combine data together
    adata = A_adata.concatenate(B_adata,
                                C_adata,
                                batch_key="batch",
                                batch_categories=['A', 'B', 'C'])
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None

    ## load meta data information
    PBMC_batch1_df = load_PBMC_batch1_df(data_dir)
    common_barcodes = set(PBMC_batch1_df['barcode']).intersection(
        set(adata.obs['barcode']))
    adata = adata[list(common_barcodes)]

    adata.obs = adata.obs.merge(PBMC_batch1_df,
                                on=["barcode", "batch"],
                                how="left")
    adata.obs.index = adata.obs["barcode"]
    adata.obs.index.name = None
    adata.var.index.name = None

    if batch is not None:
        batch_cells = adata.obs[adata.obs['batch'] == batch].index
        adata = adata[batch_cells]

    if ind is not None:
        ind_list = [int(x) for x in ind.split('_')]
        ind_cells = adata.obs[adata.obs["ind"].isin(ind_list)].index
        adata = adata[ind_cells]
    return adata
Example #16
def autoencode(n_inoutnodes_human,
               n_inoutnodes_mouse,
               shared_size,
               adata=None,
               mtx_file=None,
               pred_adata=None,
               pred_mtx_file=None,
               species=None,
               nonmissing_indicator=None,
               initial_file="",
               out_dir=".",
               write_output_to_tsv=False,
               save_data=False,
               verbose=True,
               verbose_sum=True,
               verbose_fit=1,
               batch_size=32,
               data_name='',
               net_kwargs={},
               training_kwargs={}):

    if adata is None:
        if mtx_file is None:
            print('Either adata or mtx_file should be provided')
            return
        adata = anndata.read_mtx(mtx_file).transpose()
        if data_name == '':
            data_name = re.sub(r'.*/', '', mtx_file)
            data_name = data_name.replace('.mtx', '') + '_'

    assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance'

    ## add other information into AnnData instances
    adata.X = csr_matrix(adata.X)

    if species is not None:
        adata.uns['species'] = species

    adata.uns['data_type'] = 'UMI'

    # set seed for reproducibility
    np.random.seed(42)
    tf.random.set_seed(42)


 #   print(type(adata.X))
    adata = read_dataset(adata,
                         transpose=False,
                         test_split=False,
                         verbose = verbose,
                         copy=False)

    if 'X_dca' in adata.obsm_keys():
        filter_min_counts = False
        size_factors=False
        adata.X = csr_matrix(adata.obsm['X_dca'])
        if pred_adata:
            pred_adata.X = csr_matrix(pred_adata.obsm['X_dca'])
    else:
        filter_min_counts = True
        size_factors = True

    adata = normalize(adata,
                      filter_min_counts = filter_min_counts,
                      size_factors=size_factors,
                      logtrans_input=True)
    adata.uns['shared'] = adata.X.tocsc()[:, 0:shared_size].tocsr()

   # print(type(adata.X))

    if pred_adata or pred_mtx_file:
        if pred_adata is None:
            pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
        else:
            pred_adata.X = csr_matrix(pred_adata.X)
        pred_adata.uns['species'] = species
        pred_adata.uns['data_type'] = 'UMI'
        pred_adata = read_dataset(pred_adata,
                transpose=False, verbose = verbose,
                test_split=False)
        pred_adata = normalize(pred_adata,
                size_factors=size_factors,
                logtrans_input=True)
        pred_adata.uns['shared'] = pred_adata.X.tocsc()[:, 0:shared_size].tocsr()

    if nonmissing_indicator is None:
        nonmissing_indicator = 1


    net = nj.JointAutoencoder(input_size_human=n_inoutnodes_human,
                              input_size_mouse=n_inoutnodes_mouse,
                              shared_size=shared_size,
                              **net_kwargs)

    net.build()

    if (initial_file != ""):
        net.load_weights(initial_file)
        print("Weights loaded from %s!" % initial_file)

    model = tj.train_joint(adata[adata.obs.DCA_split == 'train'], 
            adata.uns['shared'],
            net, 
            output_dir=out_dir, batch_size = batch_size,
            save_weights = True, 
            verbose = verbose, verbose_sum = verbose_sum, verbose_fit = verbose_fit,
            nonmissing_indicator = nonmissing_indicator,
            **training_kwargs)

    model.load_weights("%s/weights.hdf5" % out_dir)

    if pred_adata or pred_mtx_file:
        del adata
        res = net.predict(pred_adata, pred_adata.uns['shared'])
        del model,net
        gc.collect()
        pred_adata.obsm['X_dca'] = res['mean_norm']

        if write_output_to_tsv:
            print('Saving files ...')
            write_text_matrix(res['mean_norm'], 
                    os.path.join(out_dir, data_name + 'pred_mean_norm.tsv'))

        if save_data:
            with open(os.path.join(out_dir, data_name + 'pred_adata.pickle'), 'wb') as f:
                pickle.dump(pred_adata, f, protocol=4)
                f.close()

        return pred_adata


    res = net.predict(adata, adata.uns['shared'])
    del model,net
    gc.collect()

    adata.obsm['X_dca'] = res['mean_norm']


    if write_output_to_tsv:
        print('Saving files ...')
        write_text_matrix(res['mean_norm'], 
            os.path.join(out_dir, data_name + 'mean_norm.tsv'))
         #   write_text_matrix(res['dispersion'], 
         #       os.path.join(out_dir, data_name + 'dispersion.tsv'))
    if save_data:
        with open(os.path.join(out_dir, data_name + 'adata.pickle'), 'wb') as f:
            pickle.dump(adata, f, protocol=4)
            f.close()

    return adata
Example #17
def load_shareseq_data(tissue: str,
                       dirname: str,
                       mode: str = "RNA") -> AnnData:
    """Load the SHAREseq data"""
    assert os.path.isdir(dirname)
    atac_fname_dict = {
        "skin": [
            "GSM4156597_skin.late.anagen.barcodes.txt.gz",
            "GSM4156597_skin.late.anagen.counts.txt.gz",
            "GSM4156597_skin.late.anagen.peaks.bed.gz",
        ],
        "brain": [
            "GSM4156599_brain.barcodes.txt.gz",
            "GSM4156599_brain.counts.txt.gz",
            "GSM4156599_brain.peaks.bed.gz",
        ],
        "lung": [
            "GSM4156600_lung.barcodes.txt.gz",
            "GSM4156600_lung.counts.txt.gz",
            "GSM4156600_lung.peaks.bed.gz",
        ],
    }
    rna_fname_dict = {
        "skin": "GSM4156608_skin.late.anagen.rna.counts.txt.gz",
        "brain": "GSM4156610_brain.rna.counts.txt.gz",
        "lung": "GSM4156611_lung.rna.counts.txt.gz",
    }
    assert atac_fname_dict.keys() == rna_fname_dict.keys()
    assert tissue in atac_fname_dict.keys(), f"Unrecognized tissue: {tissue}"

    atac_barcodes_fname, atac_counts_fname, atac_peaks_fname = atac_fname_dict[
        tissue]
    assert "barcodes" in atac_barcodes_fname  # Check fnames are unpacked correctly
    assert "counts" in atac_counts_fname
    assert "peaks" in atac_peaks_fname
    atac_cell_barcodes = pd.read_csv(
        os.path.join(dirname, atac_barcodes_fname),
        delimiter="\t",
        index_col=0,
        header=None,
    )
    atac_cell_barcodes.index = [
        i.replace(",", ".") for i in atac_cell_barcodes.index
    ]

    # Load in RNA data
    if mode == "RNA":
        retval = ad.read_text(os.path.join(dirname, rna_fname_dict[tissue])).T
        # Ensure that we return a sparse matrix as the underlying datatype
        retval.X = scipy.sparse.csr_matrix(retval.X)
        # Fix formatting of obs names where commas were used for periods
        retval.obs.index = [i.replace(",", ".") for i in retval.obs.index]
        intersected_barcodes = [
            bc for bc in retval.obs_names
            if bc in set(atac_cell_barcodes.index)
        ]
        assert intersected_barcodes, f"No common barcodes between RNA/ATAC for {tissue}"
        logging.info(
            f"RNA {tissue} intersects {len(intersected_barcodes)}/{len(retval.obs_names)} barcodes with ATAC"
        )
        retval = retval[intersected_barcodes]

    elif mode == "ATAC":
        # Load in ATAC data
        # read_mtx automatically gives us a sparse matrix
        retval = ad.read_mtx(os.path.join(dirname, atac_counts_fname)).T
        # Attach metadata
        retval.obs = atac_cell_barcodes
        atac_peaks = pd.read_csv(
            os.path.join(dirname, atac_peaks_fname),
            delimiter="\t",
            header=None,
            names=["chrom", "start", "end"],
        )
        atac_peaks.index = [
            f"{c}:{s}-{e}" for _i, c, s, e in atac_peaks.itertuples()
        ]
        retval.var = atac_peaks
    else:
        raise ValueError("mode must be either RNA or ATAC")
    assert isinstance(retval.X, scipy.sparse.csr_matrix)
    return retval
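
A hypothetical call; the download directory name is illustrative and should contain the raw GEO files named in the two tables above:

atac = load_shareseq_data('skin', dirname='GSE140203_raw', mode='ATAC')
rna = load_shareseq_data('skin', dirname='GSE140203_raw', mode='RNA')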
Example #18
def autoencode(
        adata=None,
        mtx_file=None,
        pred_adata=None,  ## cross-validation purpose
        pred_mtx_file=None,
        out_dir=".",
        write_output_to_tsv=False,
        save_data=False,
        verbose=True,
        verbose_sum=True,
        verbose_fit=1,
        batch_size=32,
        data_name="",
        nonmissing_indicator=None,
        net_kwargs={}):  ###############

    if adata is None:
        if mtx_file is None:
            print('Either adata or mtx_file should be provided')
            return
        adata = anndata.read_mtx(mtx_file).transpose()
        if data_name == "":
            data_name = re.sub(r'.*/', '', mtx_file)
            data_name = data_name.replace('.mtx', '') + '_'

    assert isinstance(adata,
                      anndata.AnnData), 'adata must be an AnnData instance'

    adata.uns['data_type'] = 'UMI'

    # set seed for reproducibility
    np.random.seed(42)
    tf.random.set_seed(42)

    adata = read_dataset(adata,
                         transpose=False,
                         test_split=False,
                         verbose=verbose,
                         copy=False)

    adata = normalize(adata,
                      filter_min_counts=True,
                      size_factors=True,
                      logtrans_input=True)

    if pred_adata or pred_mtx_file:
        if pred_adata is None:
            pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
        pred_adata.uns['data_type'] = 'UMI'
        pred_adata = read_dataset(pred_adata,
                                  transpose=False,
                                  test_split=False,
                                  verbose=verbose,
                                  copy=False)
        pred_adata = normalize(pred_adata,
                               size_factors=True,
                               logtrans_input=True)

    net = NBConstantDispAutoencoder(input_size=adata.n_vars,
                                    nonmissing_indicator=nonmissing_indicator,
                                    **net_kwargs)
    net.build()

    loss = train(adata[adata.obs.DCA_split == 'train'],
                 net,
                 output_dir=out_dir,
                 batch_size=batch_size,
                 save_weights=True,
                 nonmissing_indicator=nonmissing_indicator,
                 verbose=verbose,
                 verbose_sum=verbose_sum,
                 verbose_fit=verbose_fit)

    net.load_weights("%s/weights.hdf5" % out_dir)

    if pred_adata or pred_mtx_file:
        del adata
        res = net.predict(pred_adata)
        pred_adata.obsm['X_dca'] = res['mean_norm']
        del net, loss
        gc.collect()

        if write_output_to_tsv:
            print('Saving files ...')
            write_text_matrix(
                res['mean_norm'],
                os.path.join(out_dir, data_name + 'pred_mean_norm.tsv'))

        if save_data:
            with open(os.path.join(out_dir, data_name + 'pred_adata.pickle'),
                      'wb') as f:
                pickle.dump(pred_adata, f, protocol=4)
                f.close()
        return pred_adata

    res = net.predict(adata)
    adata.obsm['X_dca'] = res['mean_norm']
    adata.var['X_dca_dispersion'] = res['dispersion']

    if write_output_to_tsv:
        print('Saving files ...')
        write_text_matrix(res['mean_norm'],
                          os.path.join(out_dir, data_name + 'mean_norm.tsv'))
        write_text_matrix(res['dispersion'],
                          os.path.join(out_dir, data_name + 'dispersion.tsv'))
    if save_data:
        with open(os.path.join(out_dir, data_name + 'adata.pickle'),
                  'wb') as f:
            pickle.dump(adata, f, protocol=4)
            f.close()

    del net, loss
    gc.collect()

    return adata
Example #19
nonmissing_indicator = 1
out_dir = '../data/10X_pbmc_filtered'
batch_size = 261
write_output_to_tsv = False

save_data = True
verbose = True
verbose_sum = True
verbose_fit = 1
seed = 1
data_name = ""

curve = np.loadtxt(curve_file_name)
print(curve)

adata = anndata.read_mtx(mtx_file).transpose()

assert isinstance(adata, anndata.AnnData), 'adata must be an AnnData instance'

# set seed for reproducibility
np.random.seed(seed)
tf.set_random_seed(seed)

adata = read_dataset(adata,
                     transpose=False,
                     test_split=False,
                     verbose=verbose,
                     copy=False)

pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
pred_adata = read_dataset(pred_adata,
                          transpose=False,
                          test_split=False,
                          verbose=verbose,
                          copy=False)
Example #20
def load_mouseprotocol_adata(data_dir, exp=None, protocol=None, curate=False):
    '''load Mouse cortex data from different protocols

    Comparing count.umis.txt and count.reads.txt shows that, although they have
    the same dimensions, some entries differ between the two files

    - Extract out plate-based cells from count.reads.txt
    - Extract out droplet-based cells from counts.umi.txt
    - Concatenate to anndata

    @exp: cortex1, cortex2
    @protocol: plate-based (Smart-seq2), droplet-based (DroNc-seq, sci-RNA-seq,
    10x Chromium)
    @curate: whether to curate for cell types
    '''

    plate_protocols = ["Smart-seq2"]

    metadata_df = pd.read_csv(data_dir + os.sep +
                              "Mousecortex_protocols/metadata.txt",
                              header=0,
                              sep="\t")
    metadata_df = metadata_df[metadata_df["CellType"] !=
                              "Unassigned"]  ## remove unassigned

    ## they used the same cells and genes indicator
    cells = pd.read_csv(data_dir + os.sep +
                        "Mousecortex_protocols/cell.names.new.txt",
                        header=None)
    genes = pd.read_csv(data_dir + os.sep +
                        "Mousecortex_protocols/genes.counts.txt",
                        header=None)

    ## plate-based data
    read_adata = anndata.read_mtx(data_dir + os.sep +
                                  "Mousecortex_protocols/count.reads.txt").T
    read_adata.var['gene_symbols'] = [x.split('_')[1] for x in genes[0].values]
    read_adata.var_names = read_adata.var['gene_symbols']
    read_adata.var_names_make_unique(join="-")  # make unique
    read_adata.var_names.name = None

    read_adata.obs['barcode'] = cells[0].values
    read_adata.obs_names = read_adata.obs['barcode']
    read_adata.obs_names_make_unique(join="-")  ## make unique
    read_adata.obs_names.name = None

    plate_metadata = metadata_df[metadata_df['Method'].isin(plate_protocols)]
    common_cells = set(plate_metadata['NAME']).intersection(
        set(read_adata.obs_names))
    common_cells = list(common_cells)
    read_adata = read_adata[common_cells]

    obs_df = read_adata.obs.merge(plate_metadata,
                                  how='left',
                                  left_index=True,
                                  right_on='NAME')
    obs_df.index = obs_df['barcode'].values
    read_adata.obs = obs_df

    ## umi-based data
    umi_adata = anndata.read_mtx(data_dir + os.sep +
                                 "Mousecortex_protocols/count.umis.txt").T
    umi_adata.var['gene_symbols'] = [x.split('_')[1] for x in genes[0].values]
    umi_adata.var_names = umi_adata.var['gene_symbols']
    umi_adata.var_names_make_unique(join="-")  # make unique
    umi_adata.var_names.name = None

    umi_adata.obs['barcode'] = cells[0].values
    umi_adata.obs_names = umi_adata.obs['barcode']
    umi_adata.obs_names_make_unique(join="-")  ## make unique
    umi_adata.obs_names.name = None

    droplet_metadata = metadata_df[~metadata_df['Method'].isin(plate_protocols)]
    common_cells = set(droplet_metadata['NAME']).intersection(
        set(umi_adata.obs_names))
    common_cells = list(common_cells)
    umi_adata = umi_adata[common_cells]

    obs_df = umi_adata.obs.merge(droplet_metadata,
                                 how='left',
                                 left_index=True,
                                 right_on='NAME')
    obs_df.index = obs_df['barcode'].values
    umi_adata.obs = obs_df

    ## concatenate adata together
    adata = read_adata.concatenate(umi_adata,
                                   batch_key="protocol_type",
                                   batch_categories=['plate', 'droplet'])
    adata.obs.rename(columns={'CellType': 'cell.type'}, inplace=True)

    adata_obs = adata.obs
    adata_obs['Method'].replace(['10x Chromium'], '10x', inplace=True)
    adata.obs = adata_obs

    if exp is not None:
        exp_cells = adata.obs[adata.obs['Experiment'] == exp].index
        adata = adata[exp_cells]

    if protocol is not None:
        proc_cells = adata.obs[adata.obs['Method'] == protocol].index
        adata = adata[proc_cells]

    if curate:
        adata_obs = adata.obs
        adata_obs["cell.type"].replace(['Astrocyte'],
                                       'Astrocytes',
                                       inplace=True)
        adata_obs["cell.type"].replace(['Excitatory neuron'],
                                       'Neuron',
                                       inplace=True)
        adata_obs["cell.type"].replace(['Inhibitory neuron'],
                                       'Interneuron',
                                       inplace=True)
        adata_obs["cell.type"].replace(['Oligodendrocyte'],
                                       'Oligodendrocytes',
                                       inplace=True)
        adata_obs["cell.type"].replace(['OPC'],
                                       'Polydendrocytes',
                                       inplace=True)
        #adata_obs["cell.type"].replace(['Pericyte'], 'Mural', inplace=True) ## seems not the same cell types
        ## Endothelial and Microglia as it is
        adata.obs = adata_obs

    return adata
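
The chain of replace(..., inplace=True) calls in the curate branch is equivalent to a single mapping, which is easier to extend:

curation_map = {
    'Astrocyte': 'Astrocytes',
    'Excitatory neuron': 'Neuron',
    'Inhibitory neuron': 'Interneuron',
    'Oligodendrocyte': 'Oligodendrocytes',
    'OPC': 'Polydendrocytes',
}
adata.obs['cell.type'].replace(curation_map, inplace=True)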
Example #21
def autoencode(
        adata=None,
        curve_file_name=None,
        mtx_file=None,
        pred_adata=None,  ## cross-validation purpose
        pred_mtx_file=None,
        out_dir=".",
        write_output_to_tsv=False,
        save_data=False,
        verbose=True,
        verbose_sum=True,
        verbose_fit=1,
        batch_size=32,
        seed=1,
        data_name="",
        nonmissing_indicator=None):  ###############

    print(out_dir)

    curve = np.loadtxt(curve_file_name)
    print(curve)

    if adata is None:
        if mtx_file is None:
            print('Either adata or mtx_file should be provided')
            return
        adata = anndata.read_mtx(mtx_file).transpose()
        if data_name == "":
            data_name = re.sub(r'.*/', '', mtx_file)
            data_name = data_name.replace('.mtx', '') + '_'

    assert isinstance(adata,
                      anndata.AnnData), 'adata must be an AnnData instance'

    adata.uns['data_type'] = 'UMI'

    # set seed for reproducibility
    np.random.seed(seed)
    tf.set_random_seed(seed)

    adata = read_dataset(adata,
                         transpose=False,
                         test_split=False,
                         verbose=verbose,
                         copy=False)

    adata.raw = adata
    if pred_adata or pred_mtx_file:
        if pred_adata is None:
            pred_adata = anndata.read_mtx(pred_mtx_file).transpose()
        pred_adata.uns['data_type'] = 'UMI'
        pred_adata = read_dataset(pred_adata,
                                  transpose=False,
                                  test_split=False,
                                  verbose=verbose,
                                  copy=False)

    tmpX = adata.X.A
    tmpX = tf.convert_to_tensor(tmpX, dtype=np.float32)
    curve = tf.cast(curve, tf.float32)
    pi = PiAct(curve[1] * K.exp(curve[0] - K.exp(curve[2]) * tmpX))
    net = DecayModelAutoencoder(curve=curve,
                                pi=pi,
                                input_size=adata.n_vars,
                                nonmissing_indicator=nonmissing_indicator)
    net.build()
    print("going into training..")

    loss = train(adata[adata.obs.DCA_split == 'train'],
                 net,
                 output_dir=out_dir,
                 batch_size=batch_size,
                 save_weights=True,
                 nonmissing_indicator=nonmissing_indicator,
                 verbose=verbose,
                 verbose_sum=verbose_sum,
                 verbose_fit=verbose_fit)

    net.load_weights("%s/weights.hdf5" % out_dir)

    if pred_adata or pred_mtx_file:
        del adata
        res = net.predict(pred_adata)
        output_dispersion = res['dispersion']
        output_mean = res['mean_norm']
        outputmean_tensor = tf.convert_to_tensor(output_mean)
        output_pi_tensor = PiAct(
            curve[1] * K.exp(curve[0] - K.exp(curve[2]) * outputmean_tensor))
        output_pi = (tf.Session().run(output_pi_tensor))
        del net, loss
        gc.collect()

        if write_output_to_tsv:
            print('Saving files ...')
            write_text_matrix(
                res['mean_norm'],
                os.path.join(out_dir, data_name + 'pred_mean_norm.tsv'))

        if save_data:
            with open(os.path.join(out_dir, data_name + 'pred_adata.pickle'),
                      'wb') as f:
                pickle.dump(pred_adata, f, protocol=4)
                f.close()
        return output_mean, output_dispersion, output_pi

    res = net.predict(adata)
    output_dispersion = res['dispersion']
    output_mean = res['mean_norm']
    output_pi = res['pi']
    outputmean_tensor = tf.convert_to_tensor(output_mean)
    output_pi_tensor = PiAct(
        curve[1] * K.exp(curve[0] - K.exp(curve[2]) * outputmean_tensor))
    output_pi = (tf.Session().run(output_pi_tensor))

    if write_output_to_tsv:
        print('Saving files ...')
        write_text_matrix(res['mean_norm'],
                          os.path.join(out_dir, data_name + 'mean_norm.tsv'))
        write_text_matrix(res['dispersion'],
                          os.path.join(out_dir, data_name + 'dispersion.tsv'))
    if save_data:
        with open(os.path.join(out_dir, data_name + 'adata.pickle'),
                  'wb') as f:
            pickle.dump(adata, f, protocol=4)
            f.close()

    del net, loss
    gc.collect()

    return output_mean, output_dispersion, output_pi
Example #22
def load_PBMC_protocols_data(data_dir, exp=None, protocol=None, protocol_type=None, curate=False):
    '''load PBMC from different protocols

    Comparing cells.read.new.txt and cells.umi.new.txt shows that the cell names
    follow cells.umi.new.txt. Comparing the count matrices confirms that the UMI
    and read counts are not the same.

    - Extract out plate-based cells from counts.read.txt
    - Extract out droplet-based cells from counts.umi.txt
    - Concatenate to anndata

    @ exp: pbmc1 (frozen) and pbmc2 (fresh)
    @ protocol: plate-based (Smart-Seq2/CEL-Seq2), droplet-based (10X v2, 10X v3,
    Drop-seq, Seq-Well, inDrops)
    @ protocol_type: plate or droplet
    '''
    plate_protocols = ["CEL-Seq2", "Smart-seq2"]
    metadata_df = pd.read_csv(data_dir+os.sep+"PBMC_protocols/metadata.txt", 
            header=0, sep="\t")
    if curate:
        metadata_df["CellType"].replace(["B cell"], 'B cells', inplace=True)
        metadata_df["CellType"].replace(["CD14+ monocyte"], 'CD14+ Monocytes', inplace=True)
        metadata_df["CellType"].replace(["CD4+ T cell"], 'CD4 T cells', inplace=True)
        metadata_df["CellType"].replace(["Cytotoxic T cell"], 'CD8 T cells', inplace=True)
        metadata_df["CellType"].replace(["Natural killer cell"], 'NK cells', inplace=True)

    ## plate-based data
    read_adata = anndata.read_mtx(data_dir+os.sep+"PBMC_protocols/counts.read.txt").T
    read_cells = pd.read_csv(data_dir+os.sep+"PBMC_protocols/cells.read.new.txt",
            header=None)
    read_genes = pd.read_csv(data_dir+os.sep+"PBMC_protocols/genes.read.txt",
            header=None)
    read_adata.var['gene_symbols'] = [x.split('_')[1] for x in read_genes[0].values]
    read_adata.var_names = read_adata.var['gene_symbols']
    read_adata.var_names_make_unique(join="-") # make unique
    read_adata.var_names.name = None
    read_adata.obs['barcode'] = read_cells[0].values
    read_adata.obs_names = read_adata.obs['barcode']
    read_adata.obs_names_make_unique(join="-") ## make unique
    read_adata.obs_names.name = None
    plate_metadata = metadata_df[metadata_df['Method'].isin(plate_protocols)]
    common_cells = set(plate_metadata['NAME']).intersection(set(read_adata.obs_names))
    common_cells = list(common_cells)
    read_adata = read_adata[common_cells] # 1052 cells
    obs_df = read_adata.obs.merge(plate_metadata, how='left', 
            left_index=True, right_on='NAME')
    obs_df.index = obs_df['barcode'].values
    read_adata.obs = obs_df

    ## umi-based data
    umi_adata = anndata.read_mtx(data_dir+os.sep+"PBMC_protocols/counts.umi.txt").T
    umi_cells = pd.read_csv(data_dir+os.sep+"PBMC_protocols/cells.umi.new.txt",
            header=None)
    umi_genes = pd.read_csv(data_dir+os.sep+"PBMC_protocols/genes.umi.txt",
            header=None)
    umi_adata.var['gene_symbols'] = [x.split('_')[1] for x in umi_genes[0].values]
    umi_adata.var_names = umi_adata.var['gene_symbols']
    umi_adata.var_names_make_unique(join="-") # make unique
    umi_adata.var_names.name = None
    umi_adata.obs['barcode'] = umi_cells[0].values
    umi_adata.obs_names = umi_adata.obs['barcode']
    umi_adata.obs_names_make_unique(join="-") ## make unique
    umi_adata.obs_names.name = None
    droplet_metadata = metadata_df[~metadata_df['Method'].isin(plate_protocols)]
    common_cells = set(droplet_metadata['NAME']).intersection(set(umi_adata.obs_names))
    common_cells = list(common_cells)
    umi_adata = umi_adata[common_cells] # 29969 cells
    obs_df = umi_adata.obs.merge(droplet_metadata, how='left', 
            left_index=True, right_on='NAME')
    obs_df.index = obs_df['barcode'].values
    umi_adata.obs = obs_df

    ## concatenate adata together
    adata = read_adata.concatenate(umi_adata, batch_key="protocol_type", 
            batch_categories=['plate', 'droplet'])
    adata.obs.rename(columns={'CellType': 'cell.type'}, inplace=True)

    adata_obs = adata.obs
    adata_obs['Method'].replace(['10x Chromium (v2)', '10x Chromium (v2) A',
        '10x Chromium (v2) B'], '10x-v2', inplace=True)
    adata_obs['Method'].replace(['10x Chromium (v3)'], '10x-v3', inplace=True)
    adata.obs = adata_obs

    if exp is not None:
        exp_cells = adata.obs[adata.obs['Experiment'].isin(exp.split('_'))].index
        adata = adata[exp_cells]

    if protocol is not None:
        prot_cells = adata.obs[adata.obs['Method'].isin(protocol.split('_'))].index
        adata = adata[prot_cells]

    if protocol_type is not None:
        prot_type_cells = adata.obs[adata.obs['protocol_type'] == protocol_type].index
        adata = adata[prot_type_cells]

    return adata
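
A hypothetical call; the data directory is illustrative. Multi-value filters are passed underscore-joined, and protocol is matched after the Method values are renamed to '10x-v2'/'10x-v3':

adata = load_PBMC_protocols_data('data',
                                 exp='pbmc1',
                                 protocol='10x-v2_10x-v3',
                                 protocol_type='droplet',
                                 curate=True)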