Example #1
def doubletdetection_py(X, boost_rate, n_components, n_top_var_genes, use_phenograph,
                        n_iters, verbose, standard_scaling, p_thresh, voter_thresh):
    import doubletdetection
    # PhenoGraph parameters are only used when use_phenograph is True; define them
    # unconditionally so the classifier call below never hits a NameError
    phenograph_parameters = {"prune": True}
    clf = doubletdetection.BoostClassifier(boost_rate=boost_rate,
                                           n_components=int(n_components),
                                           n_top_var_genes=int(n_top_var_genes),
                                           use_phenograph=use_phenograph,
                                           phenograph_parameters=phenograph_parameters,
                                           n_iters=int(n_iters),
                                           verbose=verbose,
                                           standard_scaling=standard_scaling)
    # X is a cells-by-genes count matrix
    labels = clf.fit(X).predict(float(p_thresh), float(voter_thresh))
    return labels
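The wrapper above simply forwards its arguments to BoostClassifier. A hypothetical call on a synthetic cells-by-genes Poisson matrix (the same kind of input the test below uses) might look like this; the parameter values are illustrative only, not recommendations:

import numpy as np

# synthetic 500-cell x 100-gene count matrix, purely for illustration
X = np.random.poisson(size=(500, 100))
labels = doubletdetection_py(X, boost_rate=0.25, n_components=30, n_top_var_genes=10000,
                             use_phenograph=False, n_iters=10, verbose=True,
                             standard_scaling=True, p_thresh=1e-16, voter_thresh=0.5)
print("called doublets:", int((labels == 1).sum()))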
Example #2
import numpy as np
import doubletdetection


def test_classifier():

    counts = np.random.poisson(size=(500, 100))

    # no phenograph
    clf = doubletdetection.BoostClassifier(n_iters=2,
                                           use_phenograph=False,
                                           standard_scaling=True)
    clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
    clf.doublet_score()

    # with phenograph
    clf = doubletdetection.BoostClassifier(n_iters=2,
                                           use_phenograph=True,
                                           standard_scaling=True)
    clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
    clf.doublet_score()

    doubletdetection.plot.convergence(clf,
                                      show=False,
                                      p_thresh=1e-16,
                                      voter_thresh=0.5)
    doubletdetection.plot.threshold(clf, show=False, p_step=6)
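In these examples predict returns one label per cell, with 1 marking a called doublet and 0 a singlet (Examples #3 and #4 below map the labels accordingly). A small hypothetical helper for summarizing a run as a doublet rate:

import numpy as np

def doublet_rate(labels):
    # hypothetical helper: fraction of cells labeled as doublets (label == 1)
    labels = np.asarray(labels)
    return float((labels == 1).sum()) / labels.shape[0]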
Example #3
def doubletdetection_c(sample, inDir, outDir, ratio_df, out_df):
    print(sample, "start doubletdetection")

    raw_counts = doubletdetection.load_mtx(os.path.join(inDir, 'matrix.mtx'))
    # Remove columns with all 0s
    zero_genes = (np.sum(raw_counts, axis=0) == 0).A.ravel()
    raw_counts = raw_counts[:, ~zero_genes]

    clf = doubletdetection.BoostClassifier(n_iters=50,
                                           use_phenograph=False,
                                           standard_scaling=True)
    doublets = clf.fit(raw_counts).predict(p_thresh=1e-16, voter_thresh=0.5)

    f = doubletdetection.plot.convergence(
        clf,
        save=os.path.join(outDir,
                          sample + '_doubletdetection_convergence.pdf'),
        show=True,
        p_thresh=1e-16,
        voter_thresh=0.5)
    f2, umap_coords = doubletdetection.plot.umap_plot(
        raw_counts,
        doublets,
        random_state=1,
        save=os.path.join(outDir, sample + '_doubletdetection_UMAP.pdf'),
        show=True)
    f3 = doubletdetection.plot.threshold(
        clf,
        save=os.path.join(outDir, sample + '_doubletdetection_threshold.pdf'),
        show=True,
        p_step=6)

    ratio_df.loc['doubletdetection',
                 sample] = len(doublets[doublets > 0]) / len(doublets)

    out_df['doubletdetection_doublets'] = doublets
    out_df.loc[out_df[out_df['doubletdetection_doublets'] == 0].index,
               'doubletdetection_doublets'] = False
    out_df.loc[out_df[out_df['doubletdetection_doublets'] == 1].index,
               'doubletdetection_doublets'] = True
    print(sample, "Done doubletdetection")

    return ratio_df, out_df
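A hypothetical invocation of the helper above; the directory names are placeholders, and the two bookkeeping DataFrames are minimal stand-ins for whatever the surrounding pipeline builds:

import os
import pandas as pd

sample = "sample1"                             # placeholder sample name
inDir = os.path.join("data", sample)           # must contain a 'matrix.mtx' file
outDir = os.path.join("results", sample)       # plots are written here
os.makedirs(outDir, exist_ok=True)

ratio_df = pd.DataFrame(index=["doubletdetection"], columns=[sample])
out_df = pd.DataFrame()                        # per-cell columns are added by the helper

ratio_df, out_df = doubletdetection_c(sample, inDir, outDir, ratio_df, out_df)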
Example #4
### Read in data ###
raw_counts = read10x.import_cellranger_mtx(args.counts_matrix)
barcodes_df = read10x.read_barcodes(args.barcodes)

print('Counts matrix shape: {} rows, {} columns'.format(
    raw_counts.shape[0], raw_counts.shape[1]))

# Remove columns with all 0s
zero_genes = (np.sum(raw_counts, axis=0) == 0).A.ravel()
raw_counts = raw_counts[:, ~zero_genes]
print(
    'Counts matrix shape after removing unexpressed genes: {} rows, {} columns'
    .format(raw_counts.shape[0], raw_counts.shape[1]))

clf = doubletdetection.BoostClassifier(n_iters=args.n_iterations,
                                       use_phenograph=pheno,
                                       standard_scaling=standard_scaling,
                                       verbose=True)
doublets = clf.fit(raw_counts).predict(p_thresh=args.p_thresh,
                                       voter_thresh=args.voter_thresh)

results = pd.Series(doublets, name="DoubletDetection_DropletType")
dataframe = pd.concat([barcodes_df, results], axis=1)
dataframe.DoubletDetection_DropletType = dataframe.DoubletDetection_DropletType.replace(
    1.0, "doublet")
dataframe.DoubletDetection_DropletType = dataframe.DoubletDetection_DropletType.replace(
    0.0, "singlet")

dataframe.to_csv(os.path.join(args.outdir, 'DoubletDetection_results.txt'),
                 sep="\t",
                 index=False)
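The excerpt assumes an args namespace plus pheno and standard_scaling flags built earlier in the script. A minimal argparse setup consistent with the attributes referenced above might look like the following sketch; the option names mirror the attributes used in the excerpt, while the defaults and the boolean parsing are assumptions:

import argparse

parser = argparse.ArgumentParser(description="Run DoubletDetection on a 10x counts matrix")
parser.add_argument("--counts_matrix", required=True, help="path to the 10x counts matrix")
parser.add_argument("--barcodes", required=True, help="path to the barcodes file")
parser.add_argument("--outdir", default=".", help="output directory")
parser.add_argument("--n_iterations", type=int, default=50)
parser.add_argument("--p_thresh", type=float, default=1e-16)
parser.add_argument("--voter_thresh", type=float, default=0.5)
parser.add_argument("--phenograph", default="False")
parser.add_argument("--standard_scaling", default="True")
args = parser.parse_args()

# the excerpt uses plain booleans named pheno / standard_scaling
pheno = args.phenograph in (True, "True", "true")
standard_scaling = args.standard_scaling in (True, "True", "true")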
Example #5
import os
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
# os.chdir("U:\\GitHub\\My_Code_Collection\\doubletdetec")

import pandas as pd
import numpy as np
import doubletdetection as dd

counts = pd.read_csv("input.txt").values
clf = dd.BoostClassifier(n_iters=2,
                         use_phenograph=False,
                         standard_scaling=True)
labels = clf.fit(counts.T).predict(p_thresh=1e-16, voter_thresh=0.5)
doublet_score = clf.doublet_score()
pd.DataFrame(doublet_score.mask).to_csv('output1.txt',
                                        index=False,
                                        header=False)
pd.DataFrame(doublet_score).to_csv('output2.txt', index=False, header=False)

# https://github.com/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_10k_vignette.ipynb
Example #6


working_dir, sampleID, genomes = parse_arguments(sys.argv)

## Perform doublet detection for each sample sequencially
for genome in genomes.split(','):
    if os.path.isfile(working_dir + '/count/' + sampleID +
                      '/outs/filtered_feature_bc_matrix/matrix.mtx.gz'):
        matrix_path = working_dir + '/count/' + sampleID + '/outs/filtered_feature_bc_matrix/matrix.mtx.gz'
        # Remove columns (genes) with all 0s; load_mtx returns a sparse matrix,
        # so flatten the boolean result to a 1-D dense array before indexing
        raw_counts = doubletdetection.load_mtx(matrix_path)
        zero_genes = (np.sum(raw_counts, axis=0) == 0).A.ravel()
        raw_counts = raw_counts[:, ~zero_genes]

        clf = doubletdetection.BoostClassifier(n_iters=50)

        start = time.time()
        doublets = clf.fit(raw_counts).predict(p_thresh=1e-7, voter_thresh=0.8)
        end = time.time()
        #print('Time elapsed: {:.2f} seconds, {:.2f}sec/iteration, for {} iterations'.format(end-start, (end-start) / clf.n_iters, clf.n_iters))

        output_dir = working_dir + '/count/' + sampleID + '/outs/analysis/doubletdetection'
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        output_name_f0 = output_dir + '/' + sampleID + '_' + genome + '_doubletdetection_doublets_tmp.txt'
        output_doublets = open(
            output_dir + '/' + sampleID + '_' + genome +
            '_doubletdetection_doublets.txt', 'w')
        np.savetxt(output_name_f0, doublets, delimiter='\t')
Example #7
dataset.protein_names = dataset.protein_names[non_control_proteins]

# Make anndata object
adata = anndata.AnnData(dataset.X)
adata.var.index = dataset.gene_names
adata.var_names_make_unique()
adata.obs.index = dataset.barcodes
adata.obsm["protein_expression"] = dataset.protein_expression
adata.uns["protein_names"] = dataset.protein_names

# Filter doublets called by DoubletDetection
try:
    doublets = np.load("data/metadata/pbmc5kdoublets.npy")
except FileNotFoundError:
    clf = doubletdetection.BoostClassifier(n_iters=25,
                                           use_phenograph=True,
                                           verbose=True,
                                           standard_scaling=False)
    doublets = clf.fit(adata.X).predict(p_thresh=1e-7, voter_thresh=0.8) == 1
    print("{} doublet rate".format(np.sum(doublets) / adata.X.shape[0]))
    np.save("data/metadata/pbmc5kdoublets.npy", doublets)

adata = adata[~doublets.astype(bool)]

# Filter cells by min_genes
sc.pp.filter_cells(adata, min_genes=200)

# Filter cells by mitochondrial reads, n_genes, n_counts
mito_genes = adata.var_names.str.startswith("MT-")
adata.obs["percent_mito"] = np.sum(adata[:, mito_genes].X, axis=1) / np.sum(
    adata.X, axis=1)
adata.obs["n_counts"] = adata.X.sum(axis=1)
Example #8
import sys
import numpy as np
import doubletdetection
import scanpy as sc
import matplotlib.pyplot as plt

matrix_path = sys.argv[1]
adata = sc.read_10x_h5(matrix_path)
adata.var_names_make_unique()

# remove "empty" genes
sc.pp.filter_genes(adata, min_cells=1)

clf = doubletdetection.BoostClassifier(n_iters=50,
                                       use_phenograph=False,
                                       standard_scaling=True)
doublets = clf.fit(adata.X).predict(p_thresh=1e-7, voter_thresh=0.8)
#doublet_score = clf.doublet_score()

db = doublets.nonzero()
# write 1-based indices of the cells called as doublets
with open(sys.argv[2], "w") as f:
    for d in db[0]:
        f.write(str(d + 1) + "\n")
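The output file holds 1-based row indices of the cells called as doublets. A hypothetical downstream step could read them back and flag those cells in the AnnData object:

# hypothetical follow-up: mark the doublet cells in adata.obs
doublet_idx = np.atleast_1d(np.loadtxt(sys.argv[2], dtype=int)) - 1   # back to 0-based
is_doublet = np.zeros(adata.n_obs, dtype=bool)
is_doublet[doublet_idx] = True
adata.obs["doubletdetection_doublet"] = is_doublet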