Example #1
def runIntegration(inPath, outPath, method, hvg, batch, celltype=None):
    """
    params:
        method: name of the integration method to run
        batch: name of the `adata.obs` column containing the batch labels
        hvg: maximum number of highly variable genes (HVG)
    """

    adata = sc.read(inPath)

    if timing:  # 'timing' is expected to be defined at module level
        if celltype is not None:
            integrated_tmp = scIB.metrics.measureTM(method, adata, batch,
                                                    celltype)
        else:
            integrated_tmp = scIB.metrics.measureTM(method, adata, batch)

        integrated = integrated_tmp[2][0]

        integrated.uns['mem'] = integrated_tmp[0]
        integrated.uns['runtime'] = integrated_tmp[1]

    else:
        if celltype is not None:
            integrated = method(adata, batch, celltype)
        else:
            integrated = method(adata, batch)

    sc.write(outPath, integrated)
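
Every example on this page follows the same round trip: read an AnnData object with sc.read, modify it, and persist it with sc.write. A minimal, self-contained sketch of that pattern (the file name is a placeholder written to the current directory):

import numpy as np
import scanpy as sc
from anndata import AnnData

# tiny synthetic AnnData so the round trip is reproducible
adata = AnnData(X=np.random.poisson(1.0, size=(10, 5)).astype(float))
adata.uns['runtime'] = 1.23  # arbitrary metadata survives the write/read cycle

sc.write("integrated_demo.h5ad", adata)
adata_back = sc.read("integrated_demo.h5ad")
assert 'runtime' in adata_back.uns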
Example #2
    def test_non_unique_names(self, adata: AnnData, path: Path, lin_key: str, _: int):
        names_key = _lin_names(lin_key)
        adata.uns[names_key][0] = adata.uns[names_key][1]

        sc.write(path, adata)
        with pytest.raises(ValueError):
            _ = cr.read(path)
Example #3
    def test_no_lineage(self, adata: AnnData, path: Path, lin_key: str, _: int):
        del adata.obsm[lin_key]

        sc.write(path, adata)
        adata_new = cr.read(path)

        assert adata_new is not adata  # sanity check
        assert lin_key not in adata_new.obsm.keys()
Example #4
def prepare_and_load_edge2shoe(file_path,
                               restore=True,
                               save=True,
                               img_width=64,
                               img_height=64,
                               verbose=True):
    data_path = os.path.dirname(file_path)
    if restore and os.path.exists(
            os.path.join(data_path,
                         f"edges2shoes_{img_width}x{img_height}.h5ad")):
        return sc.read(
            os.path.join(data_path,
                         f"edges2shoes_{img_width}x{img_height}.h5ad"))

    tar = tarfile.open(file_path)
    images, edges = [], []

    counter = 0
    for member in tar.getmembers():
        if member.name.endswith(".jpg"):
            f = tar.extractfile(member)
            image = Image.open(f)

            edge, image = image.crop((0, 0, 256, 256)), image.crop(
                (256, 0, 512, 256))

            # use the requested output size rather than a hard-coded 64x64
            edge = edge.resize((img_width, img_height), Image.BICUBIC)
            image = image.resize((img_width, img_height), Image.NEAREST)

            edge = np.array(edge)
            image = np.array(image)

            images.append(image)
            edges.append(edge)

            counter += 1
            if verbose and counter % 1000 == 0:
                print(counter)
    images = np.array(images)
    edges = np.array(edges)

    images = images.reshape(-1, np.prod(images.shape[1:]))
    edges = edges.reshape(-1, np.prod(edges.shape[1:]))

    data = np.concatenate([images, edges], axis=0)

    if save:
        data = anndata.AnnData(X=data)
        data.obs['id'] = np.concatenate(
            [np.arange(images.shape[0]),
             np.arange(images.shape[0])])
        data.obs['condition'] = ['shoe'] * images.shape[0] + [
            'edge'
        ] * images.shape[0]
        sc.write(filename=os.path.join(
            data_path, f"edges2shoes_{img_width}x{img_height}.h5ad"),
                 adata=data)
    return data
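
A hedged usage sketch for the loader above; the archive path is hypothetical, and with save=True the result is cached as an .h5ad next to the archive and returned as an AnnData object:

adata = prepare_and_load_edge2shoe("data/edges2shoes.tar.gz",  # hypothetical archive path
                                   restore=True,
                                   save=True,
                                   img_width=64,
                                   img_height=64)
print(adata.shape, adata.obs['condition'].value_counts())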
Example #5
def save_adata(adata, filepath, ext='.h5ad', gcs=False):
    if gcs:
        temp = NamedTemporaryFile(suffix=ext, delete=False)
        temp.close()
        sc.write(temp.name, adata)
        subprocess.call('gsutil -m cp %s %s' % (temp.name, filepath),
                        shell=True)
        subprocess.call('rm %s' % temp.name, shell=True)
    else:
        sc.write(filepath, adata)
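
Possible calls to the helper above, assuming an AnnData object adata is already in scope; the bucket path is a placeholder and gsutil must be installed for the GCS branch:

save_adata(adata, "results/local_copy.h5ad")              # plain local write via sc.write
save_adata(adata, "gs://my-bucket/adata.h5ad", gcs=True)  # hypothetical bucket; uploaded with gsutil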
Example #6
    def test_no_colors(self, adata: AnnData, path: Path, lin_key: str, n_lins: int):
        colors_key = _colors(lin_key)
        del adata.uns[colors_key]

        sc.write(path, adata)
        adata_new = cr.read(path)
        lins = adata_new.obsm[lin_key]

        assert isinstance(lins, Lineage)
        np.testing.assert_array_equal(lins.colors, _create_categorical_colors(n_lins))
        np.testing.assert_array_equal(lins.colors, adata_new.uns[colors_key])
Example #7
def normalize_by_scanpy(adata,
                        adata_path_filenames,
                        exclude_highly_expressed=True,
                        raw=False):
    """
    Normalize counts per spot (cell for scRNA-seq) with scanpy function sc.pp.normalize_total().
    If the target sum is 1e6, CPM normalisation is applied.
    Excluding highly expressed genes gives the remaining, lower expressed genes more weight in the normalisation [Weinreb17].

    Parameters
    ----------
    adata : annData
    adata_path_filenames : str
    exclude_highly_expressed : bool
    raw : bool

    Returns
    -------
    adata : annData
        The count data has been normalized and log-transformed with an offset of 1.
        The offset of 1 ensures that zero counts map to zeros. Keep this data in '.raw' part of the AnnData object
        as it will be used to visualize gene expression and perform statistical tests such as computing marker genes
        for clusters

    """
    # Keep the count data in a counts layer
    adata.layers["counts"] = adata.X.copy()

    # return dictionary if inplace is False otherwise updates adata
    x_norm = sc.pp.normalize_total(
        adata,
        inplace=False,
        exclude_highly_expressed=exclude_highly_expressed,
        target_sum=1e6)['X']
    adata.X = x_norm

    # log-transforms and updates adata
    sc.pp.log1p(adata)

    # modify resulting matrix
    adata.X = np.asarray(adata.X)

    # Store the full data set in 'raw' as log-normalised data for statistical testing
    adata.raw = adata

    # save adata object
    if raw:
        sc.write('{}_raw_QC_sizefactors.h5'.format(adata_path_filenames),
                 adata=adata)
    else:
        sc.write('{}_QC_sizefactors.h5'.format(adata_path_filenames),
                 adata=adata)

    return adata
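
The core of the function above is scanpy's total-count normalization followed by a log1p transform. A minimal stand-alone sketch of that pattern on synthetic counts:

import numpy as np
import scanpy as sc
from anndata import AnnData

adata = AnnData(X=np.random.poisson(2.0, size=(100, 50)).astype(float))
adata.layers["counts"] = adata.X.copy()  # keep the raw counts around

# CPM normalization (target_sum=1e6), optionally down-weighting highly expressed genes
sc.pp.normalize_total(adata, target_sum=1e6, exclude_highly_expressed=True)
sc.pp.log1p(adata)   # log(1 + x) transform
adata.raw = adata    # freeze log-normalized values for later statistical tests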
Example #8
    def test_normal_run(self, adata: AnnData, path: Path, lin_key: str, n_lins: int):
        colors = _create_categorical_colors(10)[-n_lins:]
        names = [f"foo {i}" for i in range(n_lins)]

        adata.uns[_colors(lin_key)] = colors
        adata.uns[_lin_names(lin_key)] = names

        sc.write(path, adata)
        adata_new = cr.read(path)
        lins_new = adata_new.obsm[lin_key]

        np.testing.assert_array_equal(lins_new.colors, colors)
        np.testing.assert_array_equal(lins_new.names, names)
Example #9
    def test_no_names(self, adata: AnnData, path: Path, lin_key: str,
                      n_lins: int):
        names_key = _lin_names(lin_key)
        del adata.uns[names_key]

        sc.write(path, adata)
        adata_new = cr.read(path)
        lins = adata_new.obsm[lin_key]

        assert isinstance(lins, Lineage)
        np.testing.assert_array_equal(lins.names,
                                      [f"Lineage {i}" for i in range(n_lins)])
        np.testing.assert_array_equal(lins.names, adata_new.uns[names_key])
Example #10
    def test_wrong_names_length(self, adata: AnnData, path: Path, lin_key: str,
                                n_lins: int):
        names_key = _lin_names(lin_key)
        adata.uns[names_key] = list(adata.uns[names_key])
        adata.uns[names_key] += ["foo", "bar", "baz"]

        sc.write(path, adata)
        adata_new = cr.read(path)
        lins = adata_new.obsm[lin_key]

        assert isinstance(lins, Lineage)
        np.testing.assert_array_equal(lins.names,
                                      [f"Lineage {i}" for i in range(n_lins)])
        np.testing.assert_array_equal(lins.names, adata_new.uns[names_key])
Example #11
    def test_writeable(self, adata: AnnData, interactions: Interactions_t,
                       tmpdir):
        ligrec(adata,
               _CK,
               interactions=interactions,
               n_perms=5,
               copy=False,
               show_progress_bar=False,
               key_added="foo")
        res = adata.uns["foo"]

        sc.write(tmpdir / "ligrec.h5ad", adata)
        bdata = sc.read(tmpdir / "ligrec.h5ad")

        for key in ["means", "pvalues", "metadata"]:
            assert_frame_equal(res[key], bdata.uns["foo"][key])
Example #12
def runPP(inPath, outPath, hvg, batch, rout, scale, seurat):
    """
    params:
        inPath: path of the anndata object
        outPath: path of the preprocessed file to be written
        hvg: number of highly variable genes to use
        rout: set to true to save a Seurat object
        scale: set to true to activate scaling
        seurat: set to true to produce hvg list
    """

    adata = sc.read(inPath)
    hvgs = adata.var.index

    # remove HVG if already precomputed
    if 'highly_variable' in adata.var:
        del adata.var['highly_variable']

    if hvg > 500:
        print("Computing HVGs ...")
        if seurat:
            hvgs = scib.preprocessing.hvg_batch(
                adata,
                batch_key=batch,
                target_genes=hvg,
                adataOut=False
            )
        else:
            adata = scib.preprocessing.hvg_batch(
                adata,
                batch_key=batch,
                target_genes=hvg,
                adataOut=True
            )
    if scale:
        print("Scaling data ...")
        adata = scib.preprocessing.scale_batch(adata, batch)

    if rout:
        print("Save as RDS")
        scib.preprocessing.saveSeurat(adata, outPath, batch, hvgs)

    else:
        print("Save as HDF5")
        sc.write(outPath, adata)
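
A hedged invocation of runPP; the file paths are placeholders and scib must be installed:

runPP(inPath="adata_raw.h5ad",       # hypothetical input file
      outPath="adata_hvg2000.h5ad",  # hypothetical output file
      hvg=2000,
      batch="batch",
      rout=False,                    # write HDF5 instead of a Seurat RDS
      scale=False,
      seurat=False)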
Example #13
def normalize_by_sizefactor(adata, adata_path_filenames):
    """Normalising the count matrix using sizefactors
    The basic preprocessing includes assuming all size factors are equal
    (library size normalization to counts per million - CPM) and log-transforming the count data

    Parameters
    ----------
    adata : annData
    adata_path_filenames : str

    Returns
    -------
    adata : annData
        The count data has been normalized and log-transformed with an offset of 1.
        The offset of 1 ensures that zero counts map to zeros. Keep this data in '.raw' part of the AnnData object
        as it will be used to visualize gene expression and perform statistical tests such as computing marker genes
        for clusters

    """

    # Keep the count data in a counts layer
    adata.layers["counts"] = adata.X.copy()

    # Normalize adata
    adata.X /= adata.obs['size_factors'].values[:, None]

    # log-transforms and updates adata
    # and log or Box-Cox transformation (lambda=0)
    # because count distribution follows a power-law and afterwards a normal distribution -> easier to apply stat-tests
    sc.pp.log1p(adata)

    # modify resulting matrix
    adata.X = np.asarray(adata.X)

    # Store the full data set in 'raw' as log-normalised data for statistical testing
    adata.raw = adata

    # save adata object
    sc.write('{}_QC_sizefactors.h5'.format(adata_path_filenames), adata=adata)

    return adata
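
The function above expects precomputed size factors in adata.obs['size_factors']. A minimal sketch of one way to provide them (simple library-size factors; scran-style pooling factors computed in R are common in practice):

import numpy as np
from anndata import AnnData

adata = AnnData(X=np.random.poisson(2.0, size=(50, 200)).astype(float))
counts_per_cell = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['size_factors'] = counts_per_cell / np.median(counts_per_cell)
# now e.g. normalize_by_sizefactor(adata, adata_path_filenames="demo")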
Example #14
def read_files(filename, sample):
    '''Import 10X data, given filename (path to the matrix directory) and sample ID (used as a unique ID).'''

    path = '%s/'%(filename)
    adata = sc.read(path+'matrix.mtx',cache=True).transpose()
  
    try: 
        adata.var_names = np.genfromtxt(path + 'genes.tsv',dtype=str)[:,1]
        adata.var['GeneName'] = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 1]
        adata.var['GeneID'] = np.genfromtxt(path + 'genes.tsv', dtype=str)[:, 0]
    except OSError:  # genes.tsv absent, fall back to CellRanger v3 features files
        adata.var_names = np.genfromtxt(path + 'features.tsv.gz',dtype=str)[:,1]
        adata.var['GeneName'] = np.genfromtxt(path + 'features.tsv.gz', dtype=str)[:, 1]
        adata.var['GeneID'] = np.genfromtxt(path + 'features.tsv.gz', dtype=str)[:, 0]
    adata.obs_names = np.genfromtxt(path + 'barcodes.tsv',dtype=str)
    # drop the trailing "-1" barcode suffix (str.strip("-1") would strip characters, not the suffix)
    adata.obs_names = [filename + "-" + (x[:-2] if x.endswith("-1") else x)
                       for x in adata.obs_names]
    adata.obs['Sample'] = sample
    
    # calculate n_counts / n_genes per cell
    adata.obs['n_counts'] = np.sum(adata.X, axis=1).A1
    adata.obs['n_genes'] = np.sum(adata.X>0,axis=1)
    
    mito_genes = adata.var_names.str.startswith('MT-')
    adata.obs['mito'] = (np.sum(adata.X[:, mito_genes],axis=1).A1) / (np.sum(adata.X,axis=1).A1)
    
    # filter cells
    clist = []
    clist.append(np.array(adata.obs['n_counts'] > 1000))
    clist.append(np.array(adata.obs['n_genes'] > 500))
    clist.append(np.array(adata.obs['n_genes'] < 7000))
    clist.append(np.array(adata.obs['mito'] < 0.5))

    c = np.column_stack(clist).all(axis=1)
    adata = adata[c].copy()

    sc.write('%s%s_filtered'%(version,sample),adata)
    
    return adata
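
A hedged call to the reader above; the directory is hypothetical and must contain matrix.mtx, genes.tsv (or features.tsv.gz) and barcodes.tsv, and the module-level prefix used by the final sc.write must be defined:

version = 'v1_'  # hypothetical module-level prefix used by sc.write above
adata = read_files('data/sample1_matrix_dir', sample='sample1')  # hypothetical 10X matrix directory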
Example #15
def _create_dummy_adata(n_obs: int) -> AnnData:
    """
    Create a testing :class:`anndata.AnnData` object.

    Call this function to regenerate the above objects.

    Params
    ------
    n_obs
        Number of cells.

    Returns
    -------
    :class:`anndata.AnnData`
        The created adata object.
    """

    np.random.seed(42)
    adata = scv.datasets.toy_data(n_obs=n_obs)
    scv.pp.filter_and_normalize(adata, min_shared_counts=20, n_top_genes=1000)
    adata.raw = adata[:, 42:42 + 50].copy()
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    scv.tl.recover_dynamics(adata)
    scv.tl.velocity(adata, mode="dynamical")
    scv.tl.velocity_graph(adata, mode_neighbors="connectivities")
    scv.tl.latent_time(adata)

    adata.uns["iroot"] = 0
    sc.tl.dpt(adata)

    adata.uns["connectivity_variances"] = np.ones((n_obs, n_obs),
                                                  dtype=np.float64)
    adata.uns["velocity_variances"] = np.ones((n_obs, n_obs), dtype=np.float64)

    sc.write(f"tests/_ground_truth_adatas/adata_{n_obs}.h5ad", adata)

    return adata
Example #16
def main(configs, adata, save_folder):
    """Control which pre-processing steps shall be applied before downstream analysis, DGE analysis
    and pathway enrichment analysis

    Parameters
    ----------
    configs : configparser
        contains all parameters -> to add: thresholds and cut parameters
    adata : annData
    save_folder : str

    Returns
    -------
    norm_pp_adata : annData
    adata_filename : str

    """

    print("\n-------- Overview of data sets --------")
    if configs['data']['data_type'] == 'Spatial Transcriptomics':
        # 1.1 Add meta data like which samples belong to which donor (since 04.10.2020)
        adata, tissue_cell_labels, disease_labels, lesion_labels = add_metadata(
            adata)
        # 1.2 Remove spots having no tissue/cell labels (since 06.10.2020)
        adata = adata[np.where(
            adata.obs[tissue_cell_labels].to_numpy().any(axis=1))[0]]

        dataset_type = "st"
    else:
        dataset_type = "sc"

    # print info about sample 1
    sample_name = adata.obs['sample'].values[1]
    print("\nSample {} ".format(sample_name))
    print("Shape of filtered data set: ", adata.shape)
    print("Tissue associated No. of spots: ", adata.shape[0])
    print("Tissue associated No. of genes: ", adata.shape[1])
    print("Observables contained in data sets sorted by barcodes: ",
          adata.obs_keys())
    print("Variables contained in data sets sorted by gene names: ",
          adata.var_keys())

    if configs['data']['data_type'] == 'Spatial Transcriptomics':
        # plot spots on top of images (only for the first sample)
        plot_images(configs=configs, adata=adata, save_folder=save_folder)

    # 2. Pre-processing and visualization
    # apply the following steps 2.1 - 2.6 individually on each adata object
    print("\n-------- Start Pre-processing and Visualization --------")

    # 2.0
    # show the 20 highest expressed genes (HEG) in the data set and per sample
    determine_heg(adata=adata, save_folder=save_folder)

    print("\n         Quality Control\n")
    # 2.1 QC (Quality control) of data - calculate QC covariates
    # 2.1.1 Cell QC
    # TODO Determine counts_threshold via Mean absolute deviation (MAD); find outliers :)
    adata_qc = sample_qc(adata=adata,
                         save_folder=save_folder,
                         counts_threshold=60000,
                         lower_filter_counts=2000,
                         upper_filter_counts=2500,
                         upper_filter_genes=2000,
                         log_scale=False,
                         raw=configs.getboolean("preprocessing",
                                                "read_raw_matrix"),
                         sample_name=sample_name)

    # 2.1.2 Threshold determination of UMI counts and genes
    cutted_adata, min_genes, min_shared_counts, mt_threshold, min_counts, max_counts, min_umi_genes, max_umi_genes = \
        apply_qc_filter(adata=adata_qc,
                        apply_mt_threshold=configs.getboolean("preprocessing", 'apply_mt_threshold'))

    if configs.getboolean('preprocessing', 'filter_doublets'):
        # 2.1.3 Filter out multiplets --
        cutted_adata = doublet_detection.scrublet_algorithm(
            cutted_adata, sample_name=sample_name, save_folder=save_folder)

    # save QC adata object
    adata_filename = '{}_QC.h5'.format(dataset_type)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename),
             cutted_adata)

    # 2.2 Normalization
    print("\n         Normalization\n")
    norm_adata = apply_normalisation(
        adata=cutted_adata,
        save_folder=save_folder,
        norm_type=configs['preprocessing']['normalisation_function'],
        exclude_highly_expressed=configs.getboolean(
            "preprocessing", "exclude_highly_expressed"),
        raw=configs.getboolean("preprocessing", "read_raw_matrix"),
        adata_path_filenames=configs["data"]['output_path'])
    # save QC and normed adata object
    adata_filename = '{}_QC_normed.h5'.format(dataset_type)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename),
             norm_adata)
    # TODO plot normalised count data distribution

    # -------------------------------------------------- Optional ---------------------------------------------------- #
    # 2.2.1 Scale data
    if configs.getboolean("preprocessing", "apply_scaling"):
        norm_adata = scaling_and_regression.scaling(adata=norm_adata)

    # 2.2.2 Regress out uninteresting variation
    """ATTENTION: Regressing out biological covariates is generally done to isolate particular processes in 
                  the data that you are interested in, while losing global structure in the data.
    Cell cycle stage can be a major determinant in the difference between two cell types (e.g. stem cells and 
    proliferating cells like transit amplifying cells). Removing this effect, hides the distinction"""
    if configs.getboolean("preprocessing", "apply_remove_cc_effect"):
        norm_adata = scaling_and_regression.apply_regression_variables(
            adata=norm_adata)
    # ---------------------------------------------------------------------------------------------------------------- #

    print("\n         Visualisation\n")
    # 2.3.1 Highly Variable Genes (HVG) for feature selection
    # HVG: highly expressed in some cells and lowly expressed in others
    norm_adata = highly_variable_genes.find_highly_variable_genes(
        norm_adata,
        type_dataset="uncorrected",
        save_folder=save_folder,
        num_top_genes=4000,
        raw=configs.getboolean("preprocessing", "read_raw_matrix"))

    # 2.3.2 Determine No. PCs
    pc_determination.pcs_combs(norm_adata,
                               save_folder,
                               use_highly_variable=True,
                               copy=False,
                               return_info=False,
                               raw=configs.getboolean("preprocessing",
                                                      "read_raw_matrix"),
                               type_dataset="uncorrected")

    # 2.3.3 Visualization
    try:
        n_comps = int(
            input(
                "Please provide the No. principal components (default 50): "))
    except ValueError:
        n_comps = 50
    norm_adata = calculate_visualizations.calc_visualization(
        norm_adata,
        save_folder=save_folder,
        raw=configs.getboolean("preprocessing", "read_raw_matrix"),
        n_comps=n_comps,
        batch_key="uncorrected")

    if configs.getboolean("preprocessing", "get_cc_effect"):
        print("\n         Cell Cycle\n")
        # 2.4 Cell cycle scoring
        # todo find another .csv file with cell cycle phases
        norm_adata = cell_cycle_storing.score_cell_cycle(
            cc_genes_file=configs['input_files']['cell_cycle'],
            adata=norm_adata,
            save_folder=save_folder,
            raw=configs.getboolean("preprocessing", "read_raw_matrix"))

    # 2.5 Apply Batch correction if samples are from same (or different) data set but splitted into batches
    # Dr. Maren Buettner:
    # "During the QC step, we observed differences across samples for instance, in the library size per dataset.
    # Such differences may contribute to the batch effect."
    if configs.getboolean("preprocessing", "sample_concat"):
        print("\n         Batch Correction\n")
        norm_bc_adata = batch_correction.apply_batch_correction(
            norm_adata,
            save_folder=save_folder,
            n_comps=n_comps,
            batch_key='sample',
            possible_batch_effects=[
                'project', 'phase', 'patient', 'disease', 'biopsy_type'
            ])

        # 2.5.1 Run find highly variable genes again on integrated dataset
        # HVG: highly expressed in some cells and lowly expressed in others
        norm_pp_adata = highly_variable_genes.find_highly_variable_genes(
            norm_bc_adata,
            type_dataset="batch_corrected",
            save_folder=save_folder,
            num_top_genes=4000,
            raw=configs.getboolean("preprocessing", "read_raw_matrix"))
        # Actually already calculated in Batch correction functions..
        # 2.5.2 Determine No. PCs
        pc_determination.pcs_combs(norm_pp_adata,
                                   save_folder,
                                   type_dataset="batch_corrected",
                                   use_highly_variable=True,
                                   copy=False,
                                   return_info=False,
                                   raw=configs.getboolean(
                                       "preprocessing", "read_raw_matrix"))

        # 2.5.3 Visualisation
        n_comps = int(
            input(
                "Please provide the No. principal components (default 50): "))
        norm_pp_adata = calculate_visualizations.calc_visualization(
            norm_pp_adata,
            save_folder=save_folder,
            raw=configs.getboolean("preprocessing", "read_raw_matrix"),
            n_comps=n_comps,
            batch_key="batch_corrected")

    else:
        norm_pp_adata = norm_adata.copy()

    adata_filename = '{}_QC_normed_BC.h5'.format(dataset_type)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename),
             norm_pp_adata)

    plots_preprocessing.plot_visualization_results(adata=norm_pp_adata,
                                                   save_folder=save_folder,
                                                   batch_key="batch_corrected",
                                                   raw=configs.getboolean(
                                                       "preprocessing",
                                                       "read_raw_matrix"))

    print("-------- Finished: Pre-processing and Visualization --------")

    print("Start storing pre-processed AnnData object")
    # 2.7 save pre-processed annData object
    # # transform float e.g. 0.25 -> 0_25
    mt_cut_splitted = str(mt_threshold).split(".")
    mt_cut = mt_cut_splitted[0] + str("_") + mt_cut_splitted[1]

    if max_umi_genes == 0:
        # save pre-processed annData object
        filter_name = '{}_minumi_{}_maxumi_{}_mg_{}_msc_{}_mt_{}_minumig_{}'.format(
            dataset_type, min_counts, max_counts, min_genes, min_shared_counts,
            mt_cut, min_umi_genes)
    else:
        # save pre-processed annData object
        filter_name = '{}_minumi_{}_maxumi_{}_mg_{}_msc_{}_mt_{}_minumig{}_maxumig_{}'.format(
            dataset_type, min_counts, max_counts, min_genes, min_shared_counts,
            mt_cut, min_umi_genes, max_umi_genes)

    adata_filename = '{}_pp.h5'.format(filter_name)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename),
             norm_pp_adata)

    return norm_pp_adata, adata_filename
Example #17
            if args.densify:
                input_counts = sc.AnnData(X=input_counts.values,
                                       obs=pd.DataFrame(index=input_counts.index),
                                       var=pd.DataFrame(index=input_counts.columns))
            else:
                input_counts = sc.AnnData(X=sp.csr_matrix(input_counts.values),
                                       obs=pd.DataFrame(index=input_counts.index),
                                       var=pd.DataFrame(index=input_counts.columns))

                
        if sp.issparse(input_counts.X) & args.densify:
            input_counts.X = np.array(input_counts.X.todense())
 
        if args.tpm is None:
            tpm = compute_tpm(input_counts)
            sc.write(cnmf_obj.paths['tpm'], tpm)
        elif args.tpm.endswith('.h5ad'):
            subprocess.call('cp %s %s' % (args.tpm, cnmf_obj.paths['tpm']), shell=True)
            tpm = sc.read(cnmf_obj.paths['tpm'])
        else:
            if args.tpm.endswith('.npz'):
                tpm = load_df_from_npz(args.tpm)
            else:
                tpm = pd.read_csv(args.tpm, sep='\t', index_col=0)
            
            if args.densify:
                tpm = sc.AnnData(X=tpm.values,
                            obs=pd.DataFrame(index=tpm.index),
                            var=pd.DataFrame(index=tpm.columns)) 
            else:
                tpm = sc.AnnData(X=sp.csr_matrix(tpm.values),
                            obs=pd.DataFrame(index=tpm.index),
                            var=pd.DataFrame(index=tpm.columns))
    k = cr.tl.transition_matrix(
        adata,
        weight_connectivities=0.2,
        mode="stochastic",
        n_jobs=n_jobs,
        softmax_scale=None,
        show_progress_bar=False,
    )

    g = cr.tl.estimators.GPCCA(k)
    g.compute_schur(20)
    g.compute_macrostates(9, cluster_key="clusters")
    g.set_terminal_states_from_macrostates(
        ["Alpha", "Beta", "Epsilon", "Delta"])

    sc.write(ROOT / "adata_preprocessed.h5ad", adata)
    g.terminal_states.to_csv(ROOT / "terminal_states.csv")
    terminal_states = g.terminal_states


def compute_abs_probs(
    ixs: np.ndarray,
    adata: AnnData,
    terminal_states: pd.Series,
    c: cr.tl.kernels.ConnectivityKernel,
):
    res = []

    for i in ixs:
        try:
            conn = c.copy()
def prepare_and_load_celeba(file_path,
                            attr_path,
                            landmark_path,
                            gender='Male',
                            attribute='Smiling',
                            max_n_images=None,
                            restore=True,
                            save=True,
                            img_width=64,
                            img_height=78,
                            verbose=True):
    data_path = os.path.dirname(file_path)
    zip_filename = os.path.basename(file_path).split(".")[0]
    if restore and os.path.exists(
            os.path.join(
                data_path,
                f"celeba_{attribute}_{img_width}x{img_height}_{max_n_images}.h5ad"
            )):
        return sc.read(
            os.path.join(
                data_path,
                f"celeba_{attribute}_{img_width}x{img_height}_{max_n_images}.h5ad"
            ))

    def load_attr_list(file_path):
        indices = []
        attributes = []
        with open(file_path) as f:
            lines = f.read().splitlines()
            columns = lines[1].split(" ")
            columns.remove('')
            for i in range(2, len(lines)):
                elements = lines[i].split()
                indices.append(elements[0])
                attributes.append(list(map(int, elements[1:])))
        attr_df = pd.DataFrame(attributes)
        attr_df.index = indices
        attr_df.columns = columns
        if verbose:
            print(attr_df.shape[0])
        return attr_df

    def load_landmark_list(file_path):
        indices = []
        landmarks = []
        with open(file_path) as f:
            lines = f.read().splitlines()
            columns = lines[1].split(" ")
            for i in range(2, len(lines)):
                elements = lines[i].split()
                indices.append(elements[0])
                landmarks.append(list(map(int, elements[1:])))
        landmarks_df = pd.DataFrame(landmarks)
        landmarks_df.index = indices
        landmarks_df.columns = columns
        print(landmarks_df.shape[0])
        return landmarks_df

    images = []
    zfile = zipfile.ZipFile(file_path)
    counter = 0
    attr_df = load_attr_list(attr_path)
    landmarks = load_landmark_list(landmark_path)
    landmarks = landmarks[abs(landmarks['lefteye_x'] -
                              landmarks['righteye_x']) > 30]
    landmarks = landmarks[abs(landmarks['lefteye_x'] -
                              landmarks['nose_x']) > 15]
    landmarks = landmarks[abs(landmarks['righteye_x'] -
                              landmarks['nose_x']) > 15]
    landmarks.head()
    attr_df = attr_df.loc[landmarks.index]
    print("# of images after preprocessing: ", attr_df.shape[0])

    indices = []
    for filename in attr_df.index.tolist():
        ifile = zfile.open(os.path.join(f"{zip_filename}/", filename))
        image = Image.open(ifile)
        image_landmarks = landmarks.loc[filename]
        most_left_x = max(
            0,
            min(image_landmarks['lefteye_x'], image_landmarks['leftmouth_x']) -
            15)
        most_right_x = min(
            178,
            min(image_landmarks['righteye_x'], image_landmarks['rightmouth_x'])
            + 15)

        most_up_y = max(0, image_landmarks['lefteye_y'] - 35)
        most_down_y = min(218, image_landmarks['rightmouth_y'] + 25)

        image_cropped = image.crop(
            (most_left_x, most_up_y, most_right_x, most_down_y))
        image_cropped = image_cropped.resize((img_width, img_height),
                                             Image.NEAREST)
        image = image_cropped

        image = np.reshape(image, (img_width, img_height, 3))

        if max_n_images is None:
            images.append(image)
            indices.append(filename)
            counter += 1
            if verbose and counter % 1000 == 0:
                print(counter)
        else:
            if counter < max_n_images:
                images.append(image)
                indices.append(filename)
                counter += 1
                if verbose and counter % 1000 == 0:
                    print(counter)
            else:
                break
    images = np.array(images)
    if verbose:
        print(images.shape)
    images_df = pd.DataFrame(images.reshape(-1, np.prod(images.shape[1:])))
    images_df.index = indices

    if save:
        data = anndata.AnnData(X=images_df.values)
        attr_df = attr_df.loc[images_df.index]
        print(data.shape, attr_df.shape)
        data.obs['labels'] = attr_df[gender].values
        data.obs['condition'] = attr_df[attribute].values
        sc.write(filename=os.path.join(
            data_path,
            f"celeba_{attribute}_{img_width}x{img_height}_{max_n_images}.h5ad"
        ),
                 adata=data)
    return data
        fig, ax = plt.subplots(1, 1, figsize=(3, 3))
        groupby = "donor_sex" if "donor_sex" in a.obs.columns else "leiden"
        sc.pl.violin(a, groupby=groupby, keys="sex_ratio", ax=ax, show=False)
        ax.axhline(0, linestyle="--", color="grey")
        fig.savefig(
            output_prefix + ".single_cell.sex_ratio.svg",
            dpi=300,
            bbox_inches="tight",
        )

    # Make sure plotting order is random
    a = a[np.random.choice(a.obs.index.tolist(), a.obs.shape[0], replace=False
                           ), :]

    if not os.path.exists(output_prefix + ".filtered.h5ad"):
        sc.write(output_prefix + ".filtered.h5ad", a)

    a = sc.read(output_prefix + ".filtered.h5ad")

    sc.pl.pca_variance_ratio(a, log=True, show=False)
    plt.gca().figure.savefig(
        output_prefix + ".single_cell.pca_variance_ratio.svg",
        dpi=300,
        bbox_inches="tight",
    )

    fig = sc.pl.pca(
        a,
        color=tech_attributes + attributes,
        components=["1,2", "2,3", "3,4", "4,5"],
        return_fig=True,
    cbar_kws={"label": "log difference from normal"},
    rasterized=True)
clustermap_rasterize_heatmap(g)
clustermap_fix_label_orientation(g)
clustermap_rasterize_dendrogram(g)
savefig(g, os.path.join("results", prefix + ".dca_denoised-zinb.CNVs.whole_genome.grouped.svg"))

cnv = sc.AnnData(p)
cnv.obs = adata.obs
sc.pp.pca(cnv)
sc.pp.neighbors(cnv)
sc.tl.umap(cnv)

cnv.obs.to_csv(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.obs.csv")
cnv.obs = pd.DataFrame(index=cnv.obs.index)
sc.write(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.h5ad", cnv)

prefix = "cll-time_course-scRNA-seq.all_samples.250-50_filter"
cnv = sc.read(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.h5ad")
cnv.obs = pd.read_csv(prefix + ".dca_denoised-zinb.batch_combat.processed.cnv.processed.obs.csv", index_col=0)

c = (
    pd.DataFrame(cnv.X, index=cnv.obs.index, columns=cnv.var.index)
    .T
    .join(gene_order.set_index("gene")))

c = natsort_dataframe(c, ['chr', 'start'])

c_cll = c.loc[:, cnv.obs.loc[cnv.obs['cell_type'] == "CLL", :].index]

chosen_cells = list()
def save_adata(adata, suffix="", subdir=""):
    filename = f"{adata.uns['sampleid']}{'-' + suffix if suffix else ''}-{timestamp()}.h5ad"
    sc.write(Path(adata.uns["output_dir"]) / subdir / filename, adata)
Example #23
 def save_norm_counts(self, norm_counts):
     self._initialize_dirs()
     sc.write(self.paths['normalized_counts'], norm_counts)
Example #24
def plot_spring(adata,
                smp=None,
                names=None,
                comps='1,2',
                cont=None,
                layout='2d',
                legendloc='right margin',
                cmap=None,
                pal=None,
                right_margin=None,
                size=3):
    """
    Scatter plots.

    Parameters
    ----------
    adata : AnnData
        Annotated data matrix.
    smp : str, optional (default: first annotation)
        Sample/Cell annotation for coloring in the form "ann1,ann2,...". String
        annotation is plotted assuming categorical annotation; float and integer
        annotation are plotted assuming continuous annotation. The 'cont' option
        allows switching between these default choices.
    names : str, optional (default: all names in smp)
        Allows restricting the groups in the sample annotation (smp) to a subset.
    comps : str, optional (default: '1,2')
         String in the form '1,2,3'.
    cont : bool, None (default: None)
        Switch on continuous layout, switch off categorical layout.
    layout : {'2d', '3d', 'unfolded 3d'}, optional (default: '2d')
         Layout of plot.
    legendloc : see matplotlib.legend, optional (default: 'right margin')
         Options for keyword argument 'loc'.
    cmap : str (default: 'viridis')
         String denoting matplotlib color map.
    pal : list of str (default: matplotlib.rcParams['axes.prop_cycle'].by_key()['color'])
         Colors cycle to use for categorical groups.
    right_margin : float (default: 0.2)
         Adjust how far the plotting panel extends to the right.
    size : float (default: 3)
         Point size.
    """
    Y = adata['X_spring']
    if True:
        #         sett.m(0, 'set parameter add_steps > 0 to iterate. '
        #                'the current step is', dspring['istep'],
        #                '\n--> append, for example, "--plotparams add_steps 1", for a single step')
        from .. import plotting as plott
        smps = plott.scatter(
            adata,
            basis='spring',
            smp=smp,
            names=names,
            comps=comps,
            cont=cont,
            layout=layout,
            legendloc=legendloc,
            cmap=cmap,
            pal=pal,
            right_margin=right_margin,
            size=size,
            # defined in plotting
            titles=['Fruchterman-Reingold step: 12'])
        writekey = sett.basekey + '_spring'
        writekey += '_' + ('-'.join(smps)
                           if smps[0] is not None else '') + sett.plotsuffix
        plott.savefig(writekey)
        if not sett.savefigs and sett.autoshow:
            from ..compat.matplotlib import pyplot as pl
            pl.show()
    else:
        Adj = dspring['Adj']
        istep = dspring['istep']
        # TODO: don't save the adjacency matrix!!!
        import scanpy as sc
        sc.write(dspring['writekey'] + '_step{:02}'.format(istep), dspring)
        # compute the next steps
        istep_init = istep + 1
        add_steps = params['add_steps']
        del params['add_steps']
        for istep in istep_init + np.arange(add_steps, dtype=int):
            sett.mt(0, 'compute Fruchterman-Reingold layout: step', istep)
            Y = fruchterman_reingold_layout(Adj, Yinit=Y, iterations=step_size)
            sett.mt(0, 'finished computation')
            _plot({'Y': Y}, adata, istep, **params)
        # save state of Y to outfile
        dspring['Y'] = Y
        dspring['istep'] = istep
        sc.write(dspring['writekey'], dspring)
            cum = df[col]
        else:
            cum += df[col]
    fig.savefig(
        prefix + f"patient_{pat}.global_projection.stacked_bar_by_{var}.svg",
        **figkws,
    )

    # Compare with newly designed space
    sc.pp.combat(p, "processing_batch_categorical")
    sc.pp.pca(p)
    sc.pp.neighbors(p, n_neighbors=50)
    sc.tl.umap(p)
    sc.tl.leiden(p, resolution=0.5, key_added="cluster")

    sc.write(prefix + f"{pat}.own_projection.processed.h5ad", p)

    n_cols = max(n_cols, p.shape[1]) + 1

    fig, axes = plt.subplots(2, n_cols, figsize=(n_cols * 4, 2 * 4))
    for i, ch in enumerate(p.var.index):
        sc.pl.umap(p, color=[ch], ax=axes[0, i], show=False)
    k = dict(show=False, size=4)
    sc.pl.umap(p, color=["cluster"], cmap="rainbow", ax=axes[1, 0], **k)
    sc.pl.umap(p, color=["time_symptoms"], cmap="rainbow", ax=axes[1, 1], **k)
    for i, time in enumerate(times, 2):
        p.obs["plot"] = (p.obs["time_symptoms"] == time).astype(float)
        print(p.obs["plot"].sum())
        sc.pl.umap(
            p, color=["plot"], cmap="Reds", ax=axes[1, i], vmin=-0.25, **k
        )
        # save pre-processed annData object
        filter_name = '{}_minumi_{}_maxumi_{}_mg_{}_msc_{}_mt_{}_minumig{}_maxumig_{}'.format(
            dataset_type, min_counts, max_counts, min_genes, min_shared_counts,
            mt_cut, min_umi_genes, max_umi_genes)

    adata_filename = '{}_pp.h5'.format(filter_name)
    sc.write(os.path.join(configs["data"]['output_path'], adata_filename),
             norm_pp_adata)

    return norm_pp_adata, adata_filename


if __name__ == '__main__':
    output_path = os.path.join("..", "..", "output", str(date.today()))
    os.makedirs(output_path, exist_ok=True)
    adata_savepath = init_variables.init_vars()
    configs_file = ht.load_config(config_path=adata_savepath)

    # 1. Load data
    print("#   --  >Load data and information<  --   #")
    _, unpp_filtered_adata, _, _ = load_dataset(configs=configs_file)
    # save adata
    unppadata_filename = '{}_unpp.h5'.format(configs_file['data']['data_type'])
    sc.write(os.path.join(adata_savepath, unppadata_filename),
             unpp_filtered_adata)

    print("-------- Finished: Read out values --------")
    pp_adata, filename_adata = main(configs=configs_file,
                                    adata=unpp_filtered_adata,
                                    save_folder=output_path)
Example #27
def run_NMF(sample_name, matrix_10X, threads, K_range, K_selection,
            density_threshold, iteration, run_K):
    # define the output directory up front: the consensus step below needs it
    # even when run_K is set and the factorization block is skipped
    outdir = "NMF_out/"
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    if not run_K:
        logging.info("reading matrix")
        adata = sc.read_10x_mtx(matrix_10X,
                                var_names='gene_symbols',
                                cache=False)
        adata.var_names_make_unique()
        count_adat_fn = outdir + sample_name + '.h5ad'
        logging.info("writing h5 file")
        sc.write(count_adat_fn, adata)

        numiter = iteration  # Number of NMF replicates. Set this to a larger value ~200 for real data. We set this to a relatively low value here for illustration at a faster speed
        numhvgenes = 2000  ## Number of over-dispersed genes to use for running the actual factorizations
        ## Results will be saved to [output_directory]/[run_name] which in this example is example_PBMC/cNMF/pbmc_cNMF
        seed = 0  ## Specify a seed pseudorandom number generation for reproducibility

        numworkers = threads
        prepare_cmd = """python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
        prepare --output-dir %s --name %s -c %s -k %s --n-iter %d \
        --total-workers %d --seed %d --numgenes %d --beta-loss frobenius""" % (
            outdir, sample_name, count_adat_fn, K_range, numiter, numworkers,
            seed, numhvgenes)
        logging.info(
            'Prepare command assuming parallelization with %d cores:\n%s' %
            (numworkers, prepare_cmd))
        os.system(prepare_cmd)

        ## Using GNU parallel
        worker_index = ' '.join([str(x) for x in range(numworkers)])
        factorize_cmd = """parallel python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
            factorize --output-dir %s --name %s --worker-index {} ::: %s""" % (
            outdir, sample_name, worker_index)
        logging.info(
            'Factorize command to simultaneously run factorization over %d cores using GNU parallel:\n%s'
            % (numworkers, factorize_cmd))
        os.system(factorize_cmd)

        # combine
        combine_cmd = 'python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
            combine --output-dir %s --name %s' % (outdir, sample_name)
        logging.info(combine_cmd)
        os.system(combine_cmd)

        worker_index = ' '.join([str(x) for x in range(numworkers)])
        kselect_plot_cmd = 'python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
            k_selection_plot --output-dir %s --name %s' % (outdir, sample_name)
        logging.info('K selection plot command: %s' % kselect_plot_cmd)
        os.system(kselect_plot_cmd)

    # run_K
    consensus_cmd = 'python /SGRNJ/Database/script/soft/cNMF/cnmf.py \
        consensus --output-dir %s --name %s --local-density-threshold %.2f \
        --components %d --show-clustering' % (outdir, sample_name,
                                              density_threshold, K_selection)
    logging.info('Consensus command for K=%d:\n%s' %
                 (K_selection, consensus_cmd))
    os.system(consensus_cmd)

    ## Load the Z-scored GEPs which reflect how enriched a gene is in each GEP relative to all of the others
    density_threshold_str = ('%.2f' % density_threshold).replace('.', '_')
    gene_file = '{outdir}/{sample_name}/{sample_name}.gene_spectra_score.k_{K_selection}.dt_{density_threshold_str}.txt'.format(
        outdir=outdir,
        sample_name=sample_name,
        K_selection=K_selection,
        density_threshold_str=density_threshold_str)
    gene_scores = pd.read_csv(gene_file, sep='\t', index_col=0).T

    ## Obtain the top 100 genes for each GEP in sorted order and combine them into a single dataframe
    top_genes = []
    ngenes = 100
    for gep in gene_scores.columns:
        top_genes.append(
            list(
                gene_scores.sort_values(by=gep,
                                        ascending=False).index[:ngenes]))

    top_genes = pd.DataFrame(top_genes, index=gene_scores.columns).T
    top_genes_file = '{outdir}/{sample_name}_top100_genes.tsv'.format(
        outdir=outdir, sample_name=sample_name)
    top_genes.to_csv(top_genes_file, sep="\t")

    usage_file = '{outdir}/{sample_name}/{sample_name}.usages.k_{K_selection}.dt_{density_threshold_str}.consensus.txt'.format(
        outdir=outdir,
        sample_name=sample_name,
        K_selection=K_selection,
        density_threshold_str=density_threshold_str)
    logging.info("usage_file:" + usage_file)
    usage = pd.read_csv(usage_file, sep='\t', index_col=0)
    usage.columns = ['Usage_%s' % i for i in usage.columns]
    usage_norm = usage.div(usage.sum(axis=1), axis=0)
    usage_norm_file = '{outdir}/{sample_name}_usage_norm.tsv'.format(
        outdir=outdir, sample_name=sample_name)
    usage_norm.to_csv(usage_norm_file, sep="\t")
Example #28
def write(adata,version,name):
    '''write adata into [name]'''
    name = version + name
    sc.write(name,adata)
    print("_".join(name.split(".")) + " = '%s'"%name)
data_doublets = os.path.join(sc.settings.writedir, '..', 'doublets')
if not os.path.exists(data_doublets):
    os.makedirs(data_doublets)
for key in doublet_scores:
    np.savetxt(os.path.join(data_doublets, key + '_doublet_scores.txt'),
               doublet_scores[key])

doublet_scores_list = []
for key in doublet_scores:
    #print(key)
    doublet_scores_list += list(doublet_scores[key])

data.obs['doublet_scores'] = doublet_scores_list
len(doublet_scores_list)

predicted_doublets_mask = []
for key in predicted_doublets:
    #print(key)
    predicted_doublets_mask += list(predicted_doublets[key])
len(predicted_doublets_mask)

predicted_singletons_mask = [not i for i in predicted_doublets_mask]
data = data[np.array(predicted_singletons_mask), :].copy()

print('Removing %d cells due to doublet scoring' %
      (len(predicted_singletons_mask) - sum(predicted_singletons_mask)))

sc.write('SLX19841_LD_filtered_gene_bc_expression_minus_putative_doublets',
         data)
data_doublets = os.path.join(sc.settings.writedir, 'doublets')

if not os.path.exists(data_doublets):
    os.makedirs(data_doublets)

for key in doublet_scores:
    np.savetxt(os.path.join(data_doublets, key + '_doublet_scores.txt'),
               doublet_scores[key])

doublet_scores_list = []
for key in doublet_scores:
    doublet_scores_list += list(doublet_scores[key])

data.obs['doublet_scores'] = doublet_scores_list

# create the boolean mask to filter out predicted doublets
predicted_doublets_mask = []

for key in predicted_doublets:
    predicted_doublets_mask += list(predicted_doublets[key])

predicted_singletons_mask = [not i for i in predicted_doublets_mask]

data = data[np.array(predicted_singletons_mask), :].copy()
print('Removing %d cells due to doublet scoring' %
      (len(predicted_singletons_mask) - sum(predicted_singletons_mask)))

sc.write('SLX14831_12978_filtered_gene_bc_expression_minus_putative_doublets',
         data)
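
Both doublet-removal snippets above assume per-sample dictionaries doublet_scores and predicted_doublets. A minimal sketch of how they might be built with Scrublet (the sample key and path are hypothetical):

import scanpy as sc
import scrublet as scr

samples = {"SLX19841_A": sc.read("SLX19841_A.h5ad")}  # hypothetical per-sample AnnData objects

doublet_scores, predicted_doublets = {}, {}
for key, sample_adata in samples.items():
    scrub = scr.Scrublet(sample_adata.X)  # counts matrix, cells x genes
    doublet_scores[key], predicted_doublets[key] = scrub.scrub_doublets()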