def mtx_to_scanpy(url, tmpdir):
    """
    Build a scanpy object with the .mtx files from an ebi scAtlas endpoint.
    Caution, creates a tmp directory in ./tmp then deletes the
    contents right afterwards.
    :param url: single cell expression atlas url
    :param tmpdir: path for temporary write of *mtx* contents.
    :return: scanpy object with .X and .obs_names and var_names filled.
    """
    opened_url = open_url(url)
    # Dump the zipped files into a temporary folder.
    zipped = zipfile.ZipFile(BytesIO(opened_url.read()))
    os.mkdir(tmpdir)
    zipped.extractall(tmpdir)

    # Grab the three component filenames from the archive listing.
    filenames = zipped.namelist()

    # Resolve each file's full path inside the temporary directory.
    mtxfile = [f for f in filenames if f.endswith(".mtx")][0]
    mtxfile = os.path.join(tmpdir, mtxfile)
    colsfile = [f for f in filenames if f.endswith(".mtx_cols")][0]
    colsfile = os.path.join(tmpdir, colsfile)
    rowsfile = [f for f in filenames if f.endswith(".mtx_rows")][0]
    rowsfile = os.path.join(tmpdir, rowsfile)

    # Fill the anndata object.
    anndata = sc.read_mtx(mtxfile).transpose()
    anndata.obs_names = pd.read_csv(colsfile, header=None, sep="\t")[1]
    anndata.var_names = pd.read_csv(rowsfile, header=None, sep="\t")[1]

    # Remove the temporary directory and its contents.
    rmtree(tmpdir)
    return anndata
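A minimal usage sketch; the accession in the URL is a hypothetical placeholder, and the real Expression Atlas download endpoint may differ:

# Hypothetical download URL, for illustration only.
url = ("https://www.ebi.ac.uk/gxa/sc/experiment/E-MTAB-0000/"
       "download/zip?fileType=quantification-raw")
adata = mtx_to_scanpy(url, tmpdir="./tmp")
print(adata.shape)  # cells x genes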
Example 2
def scanpy_load_alevin_mtx(analysis_dir, *, force_var_names=None, force_obs_names=None):
    analysis_dir = Path(analysis_dir)
    quant_dir = analysis_dir / 'alevin'
    alevin = scanpy.read_mtx(quant_dir / 'quants_mat.mtx.gz')
    alevin_vars = pandas.read_csv(quant_dir / 'quants_mat_cols.txt', header=None).values.T
    alevin_obs = pandas.read_csv(quant_dir / 'quants_mat_rows.txt', header=None).values.T
    alevin.obs_names = alevin_obs[0]
    alevin.var_names = alevin_vars[0]

    alevin_df = alevin.to_df()
    if force_obs_names is not None:
        alevin_df = alevin_df.reindex(force_obs_names).fillna(0)
    else:
        force_obs_names = alevin_obs[0]

    if force_var_names is not None:
        alevin_df = alevin_df.T.reindex(force_var_names).T.fillna(0)
    else:
        force_var_names = alevin_vars[0]

    alevin = anndata.AnnData(X=scipy.sparse.csr_matrix(alevin_df))
    alevin.obs_names = force_obs_names
    alevin.var_names = force_var_names

    alevin.obs['counts'] = alevin.X.sum(axis=1)
    alevin.obs['ngenes'] = numpy.array((alevin.X > 0).sum(axis=1))

    return alevin
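A sketch of using the force_* arguments to align two alevin runs onto a shared gene axis; the run directories are hypothetical:

# Load a reference run, then reindex a second run onto its gene list so the
# two matrices can be compared column-for-column (missing genes become 0).
ref = scanpy_load_alevin_mtx('runs/sample_a')
other = scanpy_load_alevin_mtx('runs/sample_b', force_var_names=ref.var_names)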
Example 3
def read_mtx(path):
    """\
    Read mtx format data folder including: 
    
        * matrix file: e.g. count.mtx or matrix.mtx or their gz format
        * barcode file: e.g. barcode.txt
        * feature file: e.g. feature.txt
        
    Parameters
    ----------
    path
        the path store the mtx files  
        
    Return
    ------
    AnnData
    """
    for filename in glob(path + '/*'):
        if ('count' in filename or 'matrix' in filename
                or 'data' in filename) and ('mtx' in filename):
            adata = sc.read_mtx(filename).T
    for filename in glob(path + '/*'):
        if 'barcode' in filename:
            barcode = pd.read_csv(filename, sep='\t',
                                  header=None).iloc[:, -1].values
            adata.obs = pd.DataFrame(index=barcode)
        if 'gene' in filename or 'peaks' in filename:
            gene = pd.read_csv(filename, sep='\t',
                               header=None).iloc[:, -1].values
            adata.var = pd.DataFrame(index=gene)
        elif 'feature' in filename:
            gene = pd.read_csv(filename, sep='\t',
                               header=None).iloc[:, 1].values
            adata.var = pd.DataFrame(index=gene)
    return adata
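The folder layout this reader expects, with hypothetical file names matching the patterns above:

# data/
#   matrix.mtx    matched on 'matrix' + 'mtx'; read and transposed to cells x genes
#   barcode.txt   matched on 'barcode'; its last column becomes the obs index
#   feature.txt   matched on 'feature'; its second column becomes the var index
adata = read_mtx('data')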
Example 4
def read_mtx(path):
    """
    Read an mtx-format data folder containing:
        matrix file: e.g. count.mtx or matrix.mtx
        barcode file: e.g. barcode.txt
        feature file: e.g. feature.txt
    """
    for filename in glob(path + '/*'):
        if ('count' in filename or 'matrix' in filename
                or 'data' in filename) and ('mtx' in filename):
            adata = sc.read_mtx(filename).T
    for filename in glob(path + '/*'):
        if 'barcode' in filename:
            barcode = pd.read_csv(filename, sep='\t',
                                  header=None).iloc[:, -1].values
            print(len(barcode), adata.shape[0])
            if len(barcode) != adata.shape[0]:
                adata = adata.transpose()
            adata.obs = pd.DataFrame(index=barcode)
        if 'gene' in filename or 'peaks' in filename or 'feature' in filename:
            gene = pd.read_csv(filename, sep='\t',
                               header=None).iloc[:, -1].values
            if len(gene) != adata.shape[1]:
                adata = adata.transpose()
            adata.var = pd.DataFrame(index=gene)
    return adata
Example 5
def read_GSE132044():

    min_library_size = 5000
    min_genes = 1000
    
    folder = data_location+'/Single_Cell/Ding/'
    fname  = data_location+'/Single_Cell/Ding/counts.read.txt'
    
    #Read in and form clusters
    data  = sc.read_mtx(fname)
    data = sc.AnnData(X=data.X.transpose())
    data.uns['min_library_size'] = min_library_size
    data.uns['min_genes'] = min_genes
    data.uns['folder'] = folder
    
    samples = pd.read_csv(folder+"/cells.read.txt", skiprows=0, header=None, sep='\t')
    genes = pd.read_csv(folder+"/genes.read.txt", skiprows=0, header=None, sep='\t')
    genes = [item.split("_")[0] for item in genes[0].values.astype(str)]
    
    name_map = pd.read_csv(folder+"/map.CPM.names.Count.names.txt", sep='\t', index_col=0)
    samples_meta = pd.read_csv(folder+"/meta.txt", sep="\t", index_col=0)
    
    name_map = name_map.merge(samples_meta, how='left', left_index=True, right_index=True)
    samples  = samples.merge(name_map, how='left', left_on=0, right_index=True).set_index(0) 
    
    # The authors do not provide metadata for all samples; presumably those
    # samples are excluded from the analysis for a good reason.
    sel = (~samples.Method.isnull().values
           & ~samples.CellType.isnull().values
           & (np.array(data.X.sum(1)).reshape(-1) >= min_library_size)
           & (np.array(data.X.astype(bool).sum(axis=1)).reshape(-1) >= min_genes))

    # Attach the annotations and keep only the selected cells.
    data.obs = samples
    data.var_names = genes
    return data[sel].copy()
Example 6
def load_bustools_counts(prefix):
    prefix = str(prefix)
    data = sc.read_mtx(str(prefix) + '.mtx')
    data.obs.index = pd.read_csv(prefix + '.barcodes.txt',
                                 header=None)[0].values
    data.var.index = pd.read_csv(prefix + '.genes.txt', header=None)[0].values

    return data
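The prefix names the shared stem of the three bustools count outputs; a usage sketch with a hypothetical path:

# Expects counts.mtx, counts.barcodes.txt and counts.genes.txt side by side.
data = load_bustools_counts('bus_output/counts')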
Example 7
    def add_modality(self, modality: str, file_x: str, file_obs: str = None,
                     file_var: str = None, obs_index: str = None,
                     var_index: str = None, parent_folder: str = "",
                     transpose_x=False, overwrite=False):
        """
        Given up to 3 matrix files, creates an AnnData object and adds it to the Multimeasure object.

        Parameters
        ----------
        file_x
            Filename for the data matrix itself (as a Matrix Market file).
        file_obs
            Filename for the observation annotation matrix in csv format.
        file_var
            Filename for variables annotation matrix in csv format.
        obs_index
            column label in obs for the column that should be assigned to the index. Optional,
            but required for plotting and some filtering with scanpy.
        var_index
            column label in var for the column that should be assigned to the index. Optional,
            but required for plotting and some filtering with scanpy.
        """

        if modality not in SUPPORTED_MODALITIES:
            raise AttributeError('Unsupported modality. Must be one of ' + str(SUPPORTED_MODALITIES))

        X = sc.read_mtx(os.path.join(parent_folder, file_x))
        if transpose_x:
            X = X.transpose()

        if file_obs:
            obs = pd.read_csv(os.path.join(parent_folder, file_obs))
        else:
            obs = None

        if file_var:
            var = pd.read_csv(os.path.join(parent_folder, file_var))
        else:
            var = None

        if modality in self.measures.keys():
            if not overwrite:
                raise AttributeError("Modality of type: {} already exists in Multimeasure object".format(modality))
            else:
                logging.warning("Overwriting modality: {}".format(modality))

        if obs is not None:
            X.obs = obs
        if var is not None:
            X.var = var
        
        if var_index:
            X.var_names = X.var[var_index].tolist()
        if obs_index:
            X.obs_names = X.obs[obs_index].tolist()
        
        self.measures[modality] = X
        
        print("Modality {} added.".format(modality))
Example 8
def get_matrix(countFile, outdir, barcodes, varlist, groups, annotation, type):
    # load adata
    adata = sc.read_mtx(countFile)

    # set obs.index and add other obs
    obs = pd.read_csv(barcodes, header=None)
    adata.obs.index = obs[0].values
    obs_new = obs[0].str.split("_", n=2, expand=True)
    n_col = obs_new.shape[1]
    if n_col > 1:
        adata.obs['sample'] = obs_new.iloc[:, 0:n_col-1].apply(lambda x: '_'.join(x),
                                                                axis=1).astype('category').values
        # Use the last split column as the barcode; column 2 only exists when
        # the names contain two or more underscores.
        adata.obs['barcode'] = obs_new[n_col - 1].astype('category').values
    else:
        adata.obs['sample'] = 'one_sample'

    # set index
    if type == 'ECs':
        var = pd.read_csv(varlist, sep='\t', header=None, index_col=2)
        var.index = var.index.astype('str').values
        n_col = var.shape[1]
        if n_col > 2:
            var.drop([1]+list(range(3,n_col+1)), axis=1, inplace=True)
        else:
            var.drop([1], axis=1, inplace=True)
        var.columns = ['geneID']
        adata.var = var
    elif type == 'genes':
        var = pd.read_csv(varlist, sep='\t', header=None, index_col=0)
        adata.var.index = var.index.values

    # extend variable df assumes same index
    if annotation is not None and type == 'genes':
        an = pd.read_csv(annotation, sep='\t', header=None, index_col=0)
        adata.var = pd.merge(adata.var, an, left_index=True, right_index=True,
                            how='left')
    elif annotation is not None and type == 'ECs':
        an = pd.read_csv(annotation, sep='\t', header=None, index_col=0)
        adata.var = pd.merge(adata.var, an, left_on='geneID', right_index=True,
                            how='left')

    # add predefined groups to obs; assumes categorical data with the same index and a header
    group_keys = None
    if groups is not None:
        obs_groups = pd.read_csv(groups, sep="\t")
        obs_groups.set_index(obs_groups.keys()[0], inplace=True)
        group_keys = obs_groups.keys()
        obs_groups = obs_groups[group_keys].astype('category')  # assumed categorical, but maybe make this optional for continuous scales
        adata.obs = adata.obs.merge(obs_groups, how='left', left_index=True,
                                    right_index=True)

    print("\nData before filtering:")
    print(adata)
    return adata, group_keys
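A usage sketch for the gene-level branch; every file name is a hypothetical placeholder:

adata, group_keys = get_matrix('counts.mtx', 'out', 'barcodes.txt', 'genes.tsv',
                               groups=None, annotation=None, type='genes')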
Example 9
    def _load_bustools_count(self):
        data = sc.read_mtx(self.input_filename + '.mtx')
        data.var = pd.read_csv(self.input_filename + '.genes.txt',
                               sep='\t',
                               header=None,
                               index_col=0)
        data.obs = pd.read_csv(self.input_filename + '.barcodes.txt',
                               sep='\t',
                               header=None,
                               index_col=0)
        return data
Example 10
    def _load(self):
        if self.input_format == 'h5ad':
            return sc.read_h5ad(self.input_filename)
        elif self.input_format == 'loom':
            return sc.read_loom(self.input_filename)
        elif self.input_format == '10x':
            return self._load_10x()
        elif self.input_format in ('mtx', 'mex'):
            return sc.read_mtx(self.input_filename)
        elif self.input_format == 'bustools-count':
            return self._load_bustools_count()
        return None
Example 11
def scanpy_load_kallisto_gene_mtx(analysis_dir, filter_file=None):
    analysis_dir = Path(analysis_dir)
    kallisto = scanpy.read_mtx(analysis_dir / 'gene.mtx')
    kallisto_vars = pandas.read_csv(analysis_dir / 'gene.genes.txt', header=None).values.T
    kallisto_obs = pandas.read_csv(analysis_dir / 'gene.barcodes.txt', header=None).values.T
    kallisto.obs_names = kallisto_obs[0]
    kallisto.var_names = kallisto_vars[0]

    kallisto.obs['counts'] = kallisto.X.sum(axis=1)
    kallisto.obs['ngenes'] = numpy.array((kallisto.X > 0).sum(axis=1))

    return kallisto
Example 12
def read_dropest(dir_path, reorder=True):
    data_matrix = glob.glob(dir_path + "/*.mtx")[0]
    data_genes = glob.glob(dir_path + "/*features.*")[0]
    data_barcodes = glob.glob(dir_path + "/*barcodes.*")[0]
    adata = sc.read_mtx(data_matrix).T
    adata.var.index = pd.read_csv(data_genes, header=None)[0].values
    adata.obs.index = pd.read_csv(data_barcodes, header=None)[0].values
    adata.obs.index.name = 'Cells'
    adata.var.index.name = 'Genes'
    adata = reorder_AnnData(adata, descending=True)
    adata.raw = adata
    return adata
Example 13
def scanpy_load_solo_mtx(analysis_dir, mode='filtered'):
    assert mode in ['filtered', 'raw'], 'STAR Solo only produces raw or filtered files'

    analysis_dir = Path(analysis_dir)
    solo_dir = analysis_dir / 'Solo.out' / 'Gene' / mode
    solo = scanpy.read_mtx(solo_dir / 'matrix.mtx').T
    solo_vars = pandas.read_csv(solo_dir / 'features.tsv', header=None, sep='\t').values.T
    solo_obs = pandas.read_csv(solo_dir / 'barcodes.tsv', header=None, sep='\t').values.T
    solo.obs_names = solo_obs[0]
    solo.var_names = solo_vars[0]

    solo.obs['counts'] = solo.X.sum(axis=1)
    solo.obs['ngenes'] = numpy.array((solo.X > 0).sum(axis=1))

    return solo
Example 14
def load_AnnData(file_x: str,
                 file_obs: str = None,
                 file_var: str = None,
                 obs_index: str = None,
                 var_index: str = None,
                 parent_folder: str = "",
                 transpose_x=False):
    """
    Given up to 3 matrix files, creates AnnData file and adds it to Multimeasure object
    Parameters
    ----------
    file_x
        Filename for the data matrix itself (as a Matrix Market file).
    file_obs
        Filename for the observation annotation matrix in csv format.
    file_var
        Filename for variables annotation matrix in csv format.
    obs_index
        column label in obs for the column that should be assigned to the index. Optional,
        but required for plotting and some filtering with scanpy.
    var_index
        column label in var for the column that should be assigned to the index. Optional,
        but required for plotting and some filtering with scanpy.
    """

    X = sc.read_mtx(os.path.join(parent_folder, file_x))
    if transpose_x:
        X = X.transpose()

    if file_obs:
        obs = pd.read_csv(os.path.join(parent_folder, file_obs))
    else:
        obs = None

    if file_var:
        var = pd.read_csv(os.path.join(parent_folder, file_var))
    else:
        var = None

    if obs is not None:
        X.obs = obs
    if var is not None:
        X.var = var

    if var_index:
        X.var_names = X.var[var_index].tolist()
    if obs_index:
        X.obs_names = X.obs[obs_index].tolist()

    return X
Example 15
def load_alevin(library_names, input_path):
    '''
    Mirrors the functionality of load_inDrops (see below)

    Imports data files generated by Salmon-Alevin, when run with the --dumpMtx option. Specifically, this 
    function will expect files at the following locations:
    /input_path/library_name/alevin/quants_mat.mtx.gz
    /input_path/library_name/alevin/quants_mat_rows.txt
    /input_path/library_name/alevin/quants_mat_cols.txt
    where 'library_names' contains one or more alevin output folders located at the indicated path.
    '''

    # Create a dictionary to hold data
    D = {}
    for j, s in enumerate(library_names):
        D[s] = {}

    # Load counts data, metadata, & convert to AnnData objects
    for s in library_names:

        # Load counts, gene names into AnnData structure
        D[s] = sc.read_mtx(input_path + '/' + s + '/alevin/quants_mat.mtx.gz',
                           dtype='float32')
        D[s].var_names = np.loadtxt(input_path + '/' + s +
                                    '/alevin/quants_mat_cols.txt',
                                    dtype='str')
        D[s].obs['library_id'] = np.tile(s, D[s].n_obs)
        D[s].uns['library_id'] = s

        # Load cell barcodes into AnnData structure
        cell_bcds = np.loadtxt(input_path + '/' + s +
                               '/alevin/quants_mat_rows.txt',
                               dtype='str')

        # Append library name to each cell barcode to create unique cell IDs
        lib_cell_bcds = []
        for bcd in cell_bcds:
            lib_cell_bcds.append(s + '_' + bcd)
        D[s].obs['unique_cell_id'] = lib_cell_bcds

        # Compute total counts & number of genes per cell
        D[s].obs['n_counts'] = D[s].X.sum(1).A1
        D[s].obs['n_genes'] = D[s].X.astype(bool).sum(axis=1)

    return D
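A sketch of merging the per-library AnnData objects returned above into a single object; the library names and path are hypothetical:

import anndata
D = load_alevin(['lib1', 'lib2'], 'quant_output')
combined = anndata.concat([D[s] for s in ['lib1', 'lib2']], join='outer')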
Example 16
def mtx_to_h5ad(args):
    """Converts .mtx files from 10x CellRanger or DropEst to .h5ad"""
    if args.dropest:
        if args.verbose:
            print("Reading DropEst .mtx files from {}".format(args.dir))
        mtx_file = [
            x for x in os.listdir(args.dir) if x.endswith("_counts.mtx")
        ][0]  # get name of .mtx
        # define filenames
        mtx = args.dir + "/" + mtx_file
        genes = args.dir + "/" + mtx_file.split("_counts")[0] + "_features.txt"
        barcodes = args.dir + "/" + mtx_file.split(
            "_counts")[0] + "_barcodes.txt"
        # read files
        a = sc.read_mtx(mtx)  # read matrix
        a = a.T  # transpose DropEst matrix to cells x genes
        g = pd.read_csv(genes, delimiter="\t", header=None)  # read genes
        b = pd.read_csv(barcodes, delimiter="\t", header=None)  # read barcodes
        # add gene and barcode names
        a.obs_names = b[0].values
        a.var_names = g[0].values
        if args.verbose:
            print(
                "Writing counts to {}/{}.h5ad - {} cells and {} genes".format(
                    args.outdir,
                    mtx_file.split("_counts")[0], a.shape[0], a.shape[1]))
        a.write(
            "{}/{}.h5ad".format(args.outdir,
                                mtx_file.split("_counts")[0]),
            compression="gzip",
        )
    else:
        if args.verbose:
            print("Reading 10x CellRanger .mtx files from {}".format(args.dir))
        a = sc.read_10x_mtx(args.dir)
        # extract the sample name from the 10x directory name
        name = args.dir.split("_gene_bc_matrices")[0]
        if args.verbose:
            print(
                "Writing counts to {}/{}.h5ad - {} cells and {} genes".format(
                    args.outdir, name, a.shape[0], a.shape[1]))
        a.write("{}/{}.h5ad".format(args.outdir, name), compression="gzip")
Example 17
def atac(path, ad=False, path_mtx=None, path_genes=None, path_barcodes=None):
    # Import and transpose data
    if ad is True:
        df = sc.read(path_mtx).transpose()
        df = pd.DataFrame(data=df.X, index=df.obs_names, columns=df.var_names)
    else:
        df = sc.read_mtx(path_mtx).transpose()
        barcodes = pd.read_csv(path_barcodes, delimiter="\t", header=None)
        df.obs_names = barcodes[0].values
        genes = pd.read_csv(path_genes, delimiter="\t", header=None)
        df.var_names = genes[0].values
        df = pd.DataFrame(data=df.X.toarray(),
                          index=df.obs_names,
                          columns=df.var_names)

    # Use only cells not discarded by RNAseq filtering
    inds = pd.read_pickle(f"{path}RNAseq_prepped.pkl").index
    df = df.loc[inds]

    # Normalize data
    df[df.columns] = MaxAbsScaler().fit_transform(df[df.columns])

    # Term frequency - Inverse document frequency
    tf_idf = TfidfTransformer(norm=None, sublinear_tf=False)
    transformed = tf_idf.fit_transform(df.values).toarray()
    transformed_scaled = MaxAbsScaler().fit_transform(transformed)
    df = pd.DataFrame(transformed_scaled, index=df.index, columns=df.columns)

    # Select top peaks
    # Keep peaks that are non-zero in at least the 75th-percentile number of cells.
    nonzero_counts = (df != 0).sum()
    df = df.loc[:, df.columns[nonzero_counts >= nonzero_counts.quantile(0.75)]]
    col_vars = df.var(axis=0)
    highly_variable_cols = df.columns[col_vars >= col_vars.quantile(0.75)]
    df = df.loc[:, highly_variable_cols]
    df.to_pickle(f"{path}ATACseq_ae_prepped.pkl")

    return df
Example 18
    def _read_raw_dataset(self):
        data_path = self._data_path

        matrix_path = next(data_path.glob('*UMI.count.matrix'))
        cells_path = next(data_path.glob('*cell.annotations.txt'))
        genes_path = next(data_path.glob('*gene.annotations.txt'))
        hash_sheet_path = next(data_path.glob('*hashSampleSheet.txt'))
        hash_table_out_path = next(data_path.glob('*hashTable.out.txt'))

        cells = pd.read_table(cells_path,
                              delim_whitespace=True,
                              usecols=[0],
                              index_col=0,
                              names=[None])

        genes = pd.read_table(genes_path,
                              delim_whitespace=True,
                              names=['gene_ids', 'gene_symbols'],
                              index_col='gene_symbols')
        genes.index.name = None

        treatment_map = pd.read_table(hash_sheet_path,
                                      delim_whitespace=True,
                                      names=['treatment', 'umi', 'umi_count'])

        cell_treatment_map = pd.read_table(
            hash_table_out_path,
            delim_whitespace=True,
            names=['sample', 'barcode', 'treatment', 'axis', 'umi_count'])

        matrix = sc.read_mtx(matrix_path)
        dataset = matrix.T

        dataset.obs = cells
        dataset.var = genes

        return dataset, treatment_map, cell_treatment_map
Example 19
import scanpy as sc 
import numpy as np
import pandas as pd

hemberg = sc.read_mtx(
    '/tyrone-data/bharris/metaneighbor_protocol_data/pancreas.mtx')
hemberg_coldata = pd.read_csv(
    '/tyrone-data/bharris/metaneighbor_protocol_data/pancreas_col.csv',
    index_col=0)
hemberg_genes = np.genfromtxt(
    '/tyrone-data/bharris/metaneighbor_protocol_data/pancreas_genes.csv',
    dtype=str)

hemberg = hemberg.T

hemberg.obs = hemberg_coldata

hemberg.var_names = hemberg_genes
hemberg.obs.columns = np.string_(hemberg.obs.columns)
hemberg.write_h5ad(
    '/tyrone-data/bharris/metaneighbor_protocol_data/hemberg.h5ad',
    compression='gzip',
    compression_opts=9)
Example 20
def seurat_object_to_anndata(file_path_seurat_object, delete_tmp_file=True):
    """
    Convert a Seurat object into an AnnData object.

    Args:
        file_path_seurat_object (str): File path of the Seurat object, which should be saved in .Rds format.
        delete_tmp_file (bool): Whether to delete the temporary files.
    Returns:
        anndata: AnnData object.

    """

    # check file name
    print("input file name: " + file_path_seurat_object)
    if not file_path_seurat_object.lower().endswith(".rds"):
        raise ValueError("Seurat object should be saved as .Rds file")

    # run R script to extract information and make mtx files
    os.makedirs("tmp", exist_ok=True)
    command = f"Rscript {rscript_folder}/seurat_to_mtx.R {file_path_seurat_object}"
    #ret = os.system()
    #if ret == 0:
    #    pass
    #else:
    #    print("Error in R script")

    exec_process(command,
                 message=True,
                 wait_finished=True,
                 return_process=False)

    print("making AnnData ...")

    folder = "./tmp"

    # load data
    mm = sc.read_mtx(os.path.join(folder, "data.mtx"))
    meta = pd.read_csv(os.path.join(folder, "meta_data.csv"), index_col=0)
    meta_dtype = pd.read_csv(os.path.join(folder, "meta_data_dtype.csv"),
                             index_col=0)
    categorical_info = meta_dtype[meta_dtype["dtype"] == "factor"].index.values

    cell_ids = pd.read_csv(os.path.join(folder, "cells_index.csv")).x.values
    variable_ids = pd.read_csv(os.path.join(folder,
                                            "variables_index.csv")).x.values

    if "raw_data.mtx" in os.listdir(folder):
        raw_data = sc.read_mtx(os.path.join(folder, "raw_data.mtx"))
        mat = _constructAnnData(mm, cell_ids, variable_ids, meta,
                                categorical_info, raw_data)
    else:
        mat = _constructAnnData(mm, cell_ids, variable_ids, meta,
                                categorical_info)

    # add variable gene info
    if "var_genes.csv" in os.listdir(folder):
        variable_genes = pd.read_csv(os.path.join(folder,
                                                  "var_genes.csv")).x.values
        mat.var["variable_gene"] = mat.var.index.isin(variable_genes)

    # add color data
    color_df = pd.read_csv("tmp/cluster_color_hex.csv", index_col=0)
    mat.uns["seurat_clusters_colors"] = color_df.colors_hex.values

    # delete temporary files
    if delete_tmp_file:
        shutil.rmtree(folder)

    return mat
Example 21
def cluster(ctx, matrix, outdir, sample, barcodes, genes, n_top, min_genes, min_cells, n_genes_by_counts, pct_counts_mt, exclude_highly_expressed, max_fraction, n_top_genes, max_value, n_neighbors, n_pcs, debug):
    sample_outdir = outdir / sample / '06.cluster'
    sample_outdir.mkdir(parents=True, exist_ok=True)
    os.chdir(sample_outdir)

    logger.info('cluster start!')

    adata_mtx = sc.read_mtx(matrix).T
    obs = pd.read_csv(barcodes, index_col=0, header=None)
    var = pd.read_csv(genes, index_col=0, header=None)
    obs.index.set_names(None, inplace=True)
    var.index.set_names(None, inplace=True)
    adata_mtx.obs = obs
    adata_mtx.var = var

    result_file = sample_outdir / f'{sample}.h5ad'

    adata_mtx.var_names_make_unique()
    [sc.pl.highest_expr_genes(adata_mtx, n_top=n_top, save=f'_{sample}.{filetype}') for filetype in FILETYPE]

    sc.pp.filter_cells(adata_mtx, min_genes=min_genes)
    sc.pp.filter_genes(adata_mtx, min_cells=min_cells)

    adata_mtx.var['mt'] = adata_mtx.var_names.str.upper().str.startswith('MT-')
    sc.pp.calculate_qc_metrics(adata_mtx, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

    [sc.pl.violin(adata_mtx, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'], jitter=0.4, multi_panel=True, save=f'_{sample}.{filetype}') for filetype in FILETYPE]
    [sc.pl.scatter(adata_mtx, x='total_counts', y='pct_counts_mt', save=f'_pct_mt_{sample}.{filetype}') for filetype in FILETYPE]
    [sc.pl.scatter(adata_mtx, x='total_counts', y='n_genes_by_counts', save=f'_n_genes_{sample}.{filetype}') for filetype in FILETYPE]

    adata_mtx = adata_mtx[adata_mtx.obs.n_genes_by_counts < n_genes_by_counts, :]
    adata_mtx = adata_mtx[adata_mtx.obs.pct_counts_mt < pct_counts_mt, :]
    sc.pp.normalize_total(adata_mtx, target_sum=1e6, exclude_highly_expressed=exclude_highly_expressed, max_fraction=max_fraction)
    sc.pp.log1p(adata_mtx)
    sc.pp.highly_variable_genes(adata_mtx, n_top_genes=n_top_genes)

    [sc.pl.highly_variable_genes(adata_mtx, save=f'_{sample}.{filetype}') for filetype in FILETYPE]

    adata_mtx.raw = adata_mtx
    adata_mtx = adata_mtx[:, adata_mtx.var.highly_variable]

    sc.pp.regress_out(adata_mtx, ['total_counts', 'pct_counts_mt'])
    sc.pp.scale(adata_mtx, max_value=max_value)
    sc.tl.pca(adata_mtx, svd_solver='arpack')

    sc.pp.neighbors(adata_mtx, n_neighbors=n_neighbors, n_pcs=n_pcs)
    sc.tl.umap(adata_mtx)
    sc.tl.leiden(adata_mtx)
    sc.tl.louvain(adata_mtx)
    [sc.pl.umap(adata_mtx, color=algo, save=f'_{algo}_{sample}.{filetype}') for filetype in FILETYPE for algo in CLUSTER_ALGORITHM]

    adata_mtx.write(result_file, compression='gzip')

    logger.info('cluster done!')

    stat_info = {
        'visible': {},
        'invisible': {}
    }
    img = {}
    pngs = (sample_outdir / 'figures').rglob('*.png')
    for png in pngs:
        img[png.name] = png.resolve()

    # report
    logger.info('generate report start!')
    Reporter(name='cluster', stat_json=stat_info, outdir=sample_outdir.parent, img=img)
    logger.info('generate report done!')
Example 22
def readConos(inPath):
    from time import time
    from shutil import rmtree
    from scipy.io import mmread
    from os import mkdir, path
    import pandas as pd

    dir_path = "/tmp/conos" + str(int(time()))
    while path.isdir(dir_path):
        dir_path += '2'
    dir_path += '/'
    mkdir(dir_path)

    ro.r('library(conos)')
    ro.r(f'con <- readRDS("{inPath}")')
    ro.r('meta <- function(sobj) {return(sobj@meta.data)}')
    ro.r('metalist <- lapply(con$samples, meta)')
    ro.r('library(data.table)')
    ro.r('metaM <- do.call(rbind,unname(metalist))')
    ro.r(
        f'saveConosForScanPy(con, output.path="{dir_path}", pseudo.pca=TRUE, pca=TRUE, metadata.df=metaM)'
    )
    gene_df = pd.read_csv(dir_path + "genes.csv")

    metadata = pd.read_csv(dir_path + "metadata.csv")
    metadata.index = metadata.CellId
    del metadata["CellId"]

    embedding_df = pd.read_csv(dir_path + "embedding.csv")
    # Decide between using PCA or pseudo-PCA
    pseudopca_df = pd.read_csv(dir_path + "pseudopca.csv")
    #pca_df = pd.read_csv(dir_path + "pca.csv")

    graph_conn_mtx = mmread(dir_path + "graph_connectivities.mtx")
    graph_dist_mtx = mmread(dir_path + "graph_distances.mtx")

    adata = sc.read_mtx(dir_path + "raw_count_matrix.mtx")

    adata.var_names = gene_df["gene"].values
    adata.obs_names = metadata.index.values

    adata.obs = metadata.copy()

    # Depends on which PCA you loaded
    adata.X_pca = pseudopca_df.values
    adata.obsm['X_pca'] = pseudopca_df.values

    # Name according to embedding you saved
    adata.X_umap = embedding_df.values
    adata.obsm['X_umap'] = embedding_df.values

    adata.uns['neighbors'] = dict(connectivities=graph_conn_mtx.tocsr(),
                                  distances=graph_dist_mtx.tocsr())

    # Assign raw counts to .raw slot, load in normalised counts
    #adata.raw = adata
    #adata_temp = sc.read_mtx(DATA_PATH + "count_matrix.mtx")
    #adata.X = adata_temp.X

    rmtree(dir_path)

    return adata
Example 23
def load_fry(frydir, which_counts={'X' : ['S','A']}, verbose=False):
    """
    
    Parameters:
        frydir - The directory containing the alevin-fry quantification (i.e. the quant.json file & alevin subdirectory).
        verbose - True if messages (including error messages) should be printed out, False if the function should be quiet.
        which_counts - Dictionary specifying how a USA mode matrix should be returned or combined into the resulting
                      output matrix.  If the input is not a USA mode quantification directory, this parameter is ignored
                      and the count matrix is returned in the `X` field of the returned `AnnData` object.  If the input
                      quantification directory contains a USA mode quantification, then there are 3 sub-matrices that can 
                      be referenced in the dictionary; 'U', 'S', 'A' containing, respectively, unspliced, spliced and 
                      ambiguous counts.  The dictionary should have entries of the form `key` (str) : `value` (list[str]).
                      The following constraints apply : there should be one key-value pair with the key `X`, the resulting
                      counts will be returned in the `X` field of the AnnData object. There can be an arbitrary number
                      of other key-value pairs, but each will be returned as a layer of the resulting AnnData object.
                      Within the key-value pairs, the key refers to the layer name that will be given to the combined 
                      count matrix upon output, and the value should be a subset of `['U', 'S', 'A']` that defines 
                      which sub-matrices should be summed.  For example:
                      {'X' : ['S', 'A'], 'unspliced' : ['U']}

                      will result in a return AnnData object where the X field has a matrix in which each entry 
                      corresponds to the summed spliced and ambiguous counts for each gene in each cell, and there
                      is an additional 'unspliced' layer, whose counts are taken directly from the unspliced sub-matrix.

    Returns:
        An AnnData object with X and layers corresponding to the requested `which_counts`, or None if an 
        error is encountered.
    """
    import json
    import os
    import pandas as pd

    # since alevin-fry 0.4.1 the generic "meta_info.json"
    # has been replaced by a more informative name for each
    # sub-command. For quantification, it is "quant.json".
    # we check for both files here, in order.
    meta_info_files = ["quant.json", "meta_info.json"]

    fpath = os.path.sep.join([frydir, meta_info_files[0]])
    # first, check for the new file, if we don't find it, check
    # for the old one.
    if not os.path.exists(fpath):
        if verbose:
            print(f"Did not find a {meta_info_files[0]} file, checking for older {meta_info_files[1]}.")
        fpath = os.path.sep.join([frydir, meta_info_files[1]])
        # if we don't find the old one either, then return None
        if not os.path.exists(fpath):
            if verbose:
                print(f"Found no {meta_info_files[1]} file either; cannot proceed.")
            return None

    # if we got here then we had a valid json file, so 
    # use it to get the number of genes, and if we are 
    # in USA mode or not.
    meta_info = json.load(open(fpath))
    ng = meta_info['num_genes']
    usa_mode = meta_info['usa_mode']

    # if we are in USA mode
    if usa_mode:
        # make sure that num_genes is a multiple of 3
        if ng % 3 != 0:
            if verbose:
                print(f"Found USA mode, but num genes = {ng} is not a multiple of 3; cannot proceed.")
            return None
        # each gene has 3 splicing statuses, so the actual number of distinct 
        # genes is ng/3.
        ng = int(ng/3)
        if verbose:
            print("processing input in USA mode, will return {}".format("+".join(which_counts)))
              
        # make sure which_counts isn't empty
        assert(len(which_counts) > 0)  

        # make sure the specification in which_counts is OK
        if 'X' not in which_counts:
            if verbose:
                print('In USA mode some sub-matrices must be assigned to the \"X\" (default) output.')
            return None
        if verbose:
            print(f"will populate output field X with sum of counts frorm {which_counts['X']}.")

        for k,v in which_counts.items():
            valid_elem = len(set(v) - set(['U', 'S', 'A'])) == 0
            if not valid_elem:
                if verbose:
                    print(f'Found non-USA element in which_count element list \"{v}\" for key \"{k}\"; cannot proceed.')
                return None
            if verbose and (k != 'X'):
                print(f'will combine {v} into output layer {k}.') 

    elif verbose:
        print("Processing input in standard mode, will return processed count (which_count will be ignored).")

    # read the actual input matrix
    af_raw = scanpy.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
    afg = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng]
    # read the gene ids
    afg_df =  pd.DataFrame(afg, columns=["gene_ids"])
    afg_df = afg_df.set_index("gene_ids")
    # and the barcodes
    abc = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines() ]
    abc_df = pd.DataFrame(abc, columns=["barcodes"])
    abc_df.index = abc_df["barcodes"]
    
    x = af_raw.X
    # if we're not in USA mode, just combine this info into 
    # an AnnData object
    if not usa_mode:
        af = scanpy.AnnData(x.T, var=abc_df, obs=afg_df)
        af = af.T 
    else: # USA mode
        # otherwise, combine the sub-matrices into the output object as 
        # specified by `which_counts`
        rd = {'S' : range(0,ng), 'U' : range(ng, 2*ng), 'A' : range(2*ng,3*ng)}
        xcounts = which_counts['X']
        o = x[:, rd[xcounts[0]]]
        for wc in xcounts[1:]:
            o += x[:, rd[wc]]
        af = scanpy.AnnData(o.T, var=abc_df, obs=afg_df)
        af = af.T

        # now, if there are other layers requested, populate those
        for other_layer in which_counts.keys() - {'X'}:
            xcounts = which_counts[other_layer]
            o = x[:, rd[xcounts[0]]]
            for wc in xcounts[1:]:
                o += x[:, rd[wc]] 
            af.layers[other_layer] = o
    return af
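A usage sketch mirroring the docstring's example; the quantification directory is hypothetical:

# X holds spliced + ambiguous counts; unspliced counts land in their own layer.
adata = load_fry('quant_res',
                 which_counts={'X': ['S', 'A'], 'unspliced': ['U']},
                 verbose=True)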
Example 24
parser.add_argument('--obs')
parser.add_argument('--var')
parser.add_argument('--output')

args = parser.parse_args()

with open(args.params, 'r') as f:
    json_params = json.loads(f.read())

try:
    working_dir = os.getcwd()
    # get the parameter object
    params = json_params.get('parameters').get('input').get('anndata-spec')
    # get the matrix file
    logging.info('Now reading the matrix market file')
    adata = sc.read_mtx(os.path.join(working_dir, args.matrix)).transpose()
    logging.debug('adata variables:')
    logging.info(dir(adata))
    # get the var dataframe
    logging.info('Now reading var dataframe')
    df_var = pd.read_csv(os.path.join(working_dir, args.var),
                         delimiter=params.get('var').get('delimiter'))
    logging.debug('var columns are {}'.format(list(df_var.columns)))
    logging.debug('Now mapping pandas indexes')
    df_var.index = df_var[params.get('var').get('index_col')]
    df_var.index.name = None
    logging.debug('Now mapping var')
    adata.var = df_var
    # get the obs data
    logging.info('Now reading obs dataframe')
    df_obs = pd.read_csv(os.path.join(working_dir, args.obs),
                         delimiter=params.get('obs').get('delimiter'))
    df_obs.index = df_obs[params.get('obs').get('index_col')]
    df_obs.index.name = None
    adata.obs = df_obs
Example 25
def get_matrix(sampleName, spliced_dir, unspliced_dir, prefix):
    spliced_dir = "velocity_quant/" + sampleName + "/" + spliced_dir + "/" #spliced_counts
    unspliced_dir = "velocity_quant/" + sampleName + "/" + unspliced_dir + "/" #unspliced_counts

    # get intersection of genes and barcodes for each sample
    s = scipy.io.mmread(spliced_dir + prefix + ".mtx")
    u = scipy.io.mmread(unspliced_dir + prefix + ".mtx")

    # get intersection of barcodes and perform on s and u
    df_s_bcs = pd.read_csv(spliced_dir + prefix + ".barcodes.txt", header=None)
    df_u_bcs = pd.read_csv(unspliced_dir + prefix + ".barcodes.txt", header=None)
    s_bcs = df_s_bcs[0].values.tolist()
    u_bcs = df_u_bcs[0].values.tolist()
    bcs_is = [i for i in s_bcs if i in u_bcs]
    s_bcs_is_int = [i for i in range(len(s_bcs)) if s_bcs[i] in bcs_is]
    u_bcs_is_int = [i for i in range(len(u_bcs)) if u_bcs[i] in bcs_is]
    s = s.tocsr()[s_bcs_is_int,:]
    u = u.tocsr()[u_bcs_is_int,:]
    s_bcs = df_s_bcs.iloc[s_bcs_is_int,:]
    u_bcs = df_u_bcs.iloc[u_bcs_is_int,:]

    # get intersection of genes and perform on s and u
    df_s_genes = pd.read_csv(spliced_dir + prefix + ".genes.txt", header=None)
    df_u_genes = pd.read_csv(unspliced_dir + prefix + ".genes.txt", header=None)
    s_genes = df_s_genes[0].values.tolist()
    u_genes = df_u_genes[0].values.tolist()
    genes_is = [i for i in s_genes if i in u_genes]
    s_genes_is_int = [i for i in range(len(s_genes)) if s_genes[i] in genes_is]
    u_genes_is_int = [i for i in range(len(u_genes)) if u_genes[i] in genes_is]
    s = s.tocsc()[:,s_genes_is_int]
    u = u.tocsc()[:,u_genes_is_int]
    s_genes = df_s_genes.iloc[s_genes_is_int,:]
    u_genes = df_u_genes.iloc[u_genes_is_int,:]

    # convert back to coo
    s = s.tocoo()
    u = u.tocoo()

    # save intersected matrix, barcodes and genes
    scipy.io.mmwrite(spliced_dir + prefix + "_isect.mtx", s)
    scipy.io.mmwrite(unspliced_dir + prefix + "_isect.mtx", u)
    # save the intersected barcodes (not the originals) to match the filtered matrices
    s_bcs.to_csv(spliced_dir + prefix + ".barcodes_isect.txt", header=None, index=False)
    u_bcs.to_csv(unspliced_dir + prefix + ".barcodes_isect.txt", header=None, index=False)
    s_genes.to_csv(spliced_dir + prefix + ".genes_isect.txt", header=None, index=False)
    u_genes.to_csv(unspliced_dir + prefix + ".genes_isect.txt", header=None, index=False)

    s = sc.read_mtx(spliced_dir + prefix + "_isect.mtx")
    u = sc.read_mtx(unspliced_dir + prefix + "_isect.mtx")

    print(s_genes)
    print(u_genes)
    print(s_bcs)
    print(u_bcs)
    print(s)
    print(u)

    s.obs.index = s_bcs[0].values
    u.obs.index = u_bcs[0].values

    s.var.index = s_genes[0].values
    u.var.index = u_genes[0].values


    s_bcs["sample"] = sampleName
    u_bcs["sample"] = sampleName

    s_bcs.columns = ["bcs", "sample"]
    u_bcs.columns = ["bcs", "sample"]

    s_bcs.index = s_bcs["bcs"] + "." + s_bcs["sample"]
    u_bcs.index = u_bcs["bcs"] + "." + u_bcs["sample"]

    out = {'s': s,
           'u': u,
           's_bcs': s_bcs,
           'u_bcs': u_bcs,
           'genes': s_genes}
    return out
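The list-comprehension intersections above are quadratic in the number of barcodes and genes; a sketch of an equivalent set-based pass over the same s_bcs/u_bcs lists:

# Membership tests against a set are O(1), so one pass per axis suffices.
u_set = set(u_bcs)
s_bcs_is_int = [i for i, b in enumerate(s_bcs) if b in u_set]
s_set = set(s_bcs)
u_bcs_is_int = [i for i, b in enumerate(u_bcs) if b in s_set]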
Example 26
gene_names['gene_name'] = gene_names['extra'].str.extract(
    pat='gene_name "(.*?)";')
gene_names['gene_id'] = gene_names['extra'].str.extract(pat='gene_id "(.*?)";')
gene_names['transcript_type'] = gene_names['extra'].str.extract(
    pat='transcript_type "(.*?)";')
gene_names = gene_names[gene_names['feature_type'] == 'gene']

annotation = gene_names[['gene_id', 'gene_name',
                         'chr']].groupby('gene_name').head(1)
annotation['gene_name'] = annotation['gene_name'].str.upper()
annotation = annotation.set_index('gene_id')

#%% Read and log normalize single cell data

adata = sc.read_mtx(
    'data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx'
)
data = adata.X.toarray().T
adata = sc.AnnData(data)

cols = pd.read_csv(
    'data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_cols',
    header=None)
rows = pd.read_csv(
    'data/raw-data/E-CURD-9/E-CURD-9.aggregated_filtered_normalised_counts.mtx_rows',
    sep='\t',
    header=None)
adata.var = rows.set_index(0).join(annotation)[[
    'gene_name'
]].reset_index().set_index('gene_name')
adata.var.index = adata.var.index.astype('str')
Example 27
import json
import os

import anndata
import scanpy as sc

# Processing alevin-fry count matrix
frydir = "pancreas_quant_res"
e2n_path = "data/geneid_to_name.txt"
meta_info = json.load(open(os.path.sep.join([frydir, "meta_info.json"])))
ng = meta_info['num_genes']
usa_mode = meta_info['usa_mode']

if usa_mode:
    print("processing input in USA mode, will return A+S as the spliced count, and U as the unspliced count")
else:
    print("please follow previous steps to generate the ount matrix in the USA mode")
    assert(False)

af_raw = sc.read_mtx(os.path.sep.join([frydir, "alevin", "quants_mat.mtx"]))
ng = int(ng/3)
e2n = dict([ l.rstrip().split() for l in open(e2n_path).readlines()])
var_names = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_cols.txt"])).readlines()][:ng]
var_names = [e2n[e] for e in var_names]

obs_names = [ l.rstrip() for l in open(os.path.sep.join([frydir, "alevin", "quants_mat_rows.txt"])).readlines() ]

x = af_raw.X
spliced = x[:,range(0,ng)] + x[:,range(2*ng,3*ng)]
unspliced = x[:,range(ng, 2*ng)]

# creating AnnData using spliced and unspliced count matrix
adata = anndata.AnnData(X = spliced, 
                        layers = dict(spliced = spliced, 
                                    unspliced = unspliced))
Example 28
import scanpy as sc 
import numpy as np
import pandas as pd

tasic = sc.read_mtx(
    'tasic_counts.mtx')
tasic_coldata = pd.read_csv(
    'tasic_col.csv',
    index_col=0)
tasic_genes = np.genfromtxt(
    'tasic_genes.csv',
    dtype=str)

tasic = tasic.T

tasic.obs = tasic_coldata

tasic.var_names = tasic_genes
tasic.obs.columns = np.string_(tasic.obs.columns)
tasic.write_h5ad(
    'tasic.h5ad',
    compression='gzip',
    compression_opts=9)
Example 29
                    help='Run DCA?',
                    default=False,
                    action='store_true')

args = parser.parse_args()

print(args)

# ---------
# load data
# ---------

# matrix
if '.mtx' in args.data:
    print('Reading sparse matrix %s' % (args.data))
    x = scanpy.read_mtx(args.data)
else:
    print('Reading dense matrix %s' % (args.data))
    x = pd.read_csv(args.data, sep='\t', header=None, index_col=False)
    x = np.array(x)

# clusters
if args.clusters:
    clusters = pd.read_csv(args.clusters,
                           sep='\t',
                           header=None,
                           index_col=False)
    clusters = np.array(clusters)
    g = 'groups'
else:
    clusters = ''
Example 30
    'font.sans-serif': 'Arial',
    'font.family': 'sans-serif',
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

#%% Get genes

#note genes for the second mouse are identical, so no need to duplicate
genes = pd.read_csv('data/raw-data/GSM344007/GSM3440071_SC01_genes.tsv',
                    sep='\t',
                    names=['key', 'name'])
genes['name'] = genes['name'].str.upper()

#%% Read and log normalize single cell data
adata01 = sc.read_mtx('data/raw-data/GSM344007/GSM3440071_SC01_matrix.mtx')
adata02 = sc.read_mtx('data/raw-data/GSM344007/GSM3440072_SC02_matrix.mtx')
data01 = adata01.X.transpose()
data02 = adata02.X.transpose()

adata = sc.AnnData(scipy.sparse.vstack((data01, data02), format='csr'))
adata.var = genes.set_index('name')
adata.var_names_make_unique()
print('Healthy Mouse: ', adata.X.max())
sc.pp.log1p(adata, base=2)

#%% Exploratory plots


# Unused in the main analysis, but used to parameter search for filtering cutoff.
def exploratory_plots(adata):