def doubletdetection_py(X, boost_rate, n_components, n_top_var_genes, use_phenograph, n_iters,
                        verbose, standard_scaling, p_thresh, voter_thresh):
    """Run DoubletDetection's ``BoostClassifier`` on ``X`` and return its labels.

    Numeric arguments are cast before use (``int`` for counts, ``float`` for
    thresholds), so callers may pass them as floats or numeric strings.

    Args:
        X: Count matrix handed to ``BoostClassifier.fit``; the original
            comment describes it as cells by genes.
        boost_rate: Forwarded unchanged to ``BoostClassifier``.
        n_components: Forwarded as ``int(n_components)``.
        n_top_var_genes: Forwarded as ``int(n_top_var_genes)``.
        use_phenograph: Forwarded unchanged. When truthy,
            ``phenograph_parameters={"prune": True}`` is passed as well.
        n_iters: Forwarded as ``int(n_iters)``.
        verbose: Forwarded unchanged.
        standard_scaling: Forwarded unchanged.
        p_thresh: First positional argument of ``predict``, as ``float``.
        voter_thresh: Second positional argument of ``predict``, as ``float``.

    Returns:
        Whatever ``BoostClassifier.predict`` returns for ``X``.
    """
    import doubletdetection

    classifier_kwargs = {
        "boost_rate": boost_rate,
        "n_components": int(n_components),
        "n_top_var_genes": int(n_top_var_genes),
        "use_phenograph": use_phenograph,
        "n_iters": int(n_iters),
        "verbose": verbose,
        "standard_scaling": standard_scaling,
    }
    # Only pass phenograph settings when phenograph is requested. Previously
    # the variable was bound only in that branch but always referenced, so
    # use_phenograph=False raised NameError.
    if use_phenograph:
        classifier_kwargs["phenograph_parameters"] = {"prune": True}

    clf = doubletdetection.BoostClassifier(**classifier_kwargs)
    labels = clf.fit(X).predict(float(p_thresh), float(voter_thresh))
    return labels
def test_classifier():
    """Smoke-test ``BoostClassifier`` with ``use_phenograph`` off and on, then the plot helpers."""
    counts = np.random.poisson(size=(500, 100))

    # First pass: no phenograph. Second pass: with phenograph.
    for with_phenograph in (False, True):
        clf = doubletdetection.BoostClassifier(
            n_iters=2,
            use_phenograph=with_phenograph,
            standard_scaling=True,
        )
        clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
        clf.doublet_score()

    # The plots are drawn from the last classifier, i.e. the phenograph run.
    doubletdetection.plot.convergence(clf, show=False, p_thresh=1e-16, voter_thresh=0.5)
    doubletdetection.plot.threshold(clf, show=False, p_step=6)
def doubletdetection_c(sample, inDir, outDir, ratio_df, out_df):
    """Run DoubletDetection on one sample, save its plots and record the calls.

    Args:
        sample: Sample name; used in log lines, as the ``ratio_df`` column
            label and as the prefix of the PDF file names.
        inDir: Directory containing ``matrix.mtx``.
        outDir: Directory the three PDF plots are saved into.
        ratio_df: Table updated in place at ``['doubletdetection', sample]``
            with the fraction of positive calls (accessed via ``.loc``, so
            presumably a pandas DataFrame).
        out_df: Table that receives a ``doubletdetection_doublets`` column
            (presumably a pandas DataFrame with one row per cell; confirm
            against the caller).

    Returns:
        The tuple ``(ratio_df, out_df)`` after the updates above.
    """

    def _pdf(suffix):
        # Every plot is written to <outDir>/<sample><suffix>.
        return os.path.join(outDir, sample + suffix)

    print(sample, "start doubletdetection")

    matrix = doubletdetection.load_mtx(os.path.join(inDir, 'matrix.mtx'))

    # Remove columns with all 0s
    column_totals = np.sum(matrix, axis=0)
    empty_columns = (column_totals == 0).A.ravel()
    matrix = matrix[:, ~empty_columns]

    classifier = doubletdetection.BoostClassifier(
        n_iters=50, use_phenograph=False, standard_scaling=True)
    calls = classifier.fit(matrix).predict(p_thresh=1e-16, voter_thresh=0.5)

    # The plot return values are not needed; the calls are made for the saved PDFs.
    doubletdetection.plot.convergence(
        classifier,
        save=_pdf('_doubletdetection_convergence.pdf'),
        show=True,
        p_thresh=1e-16,
        voter_thresh=0.5)
    _umap_figure, _umap_coords = doubletdetection.plot.umap_plot(
        matrix,
        calls,
        random_state=1,
        save=_pdf('_doubletdetection_UMAP.pdf'),
        show=True)
    doubletdetection.plot.threshold(
        classifier,
        save=_pdf('_doubletdetection_threshold.pdf'),
        show=True,
        p_step=6)

    positive_calls = calls[calls > 0]
    ratio_df.loc['doubletdetection', sample] = len(positive_calls) / len(calls)

    column = 'doubletdetection_doublets'
    out_df[column] = calls
    # Relabel 0 -> False first, then 1 -> True; any other value is left untouched.
    for code, flag in ((0, False), (1, True)):
        matching_rows = out_df[out_df[column] == code].index
        out_df.loc[matching_rows, column] = flag

    print(sample, "Done doubletdetection")
    return ratio_df, out_df
### Read in data ###
raw_counts = read10x.import_cellranger_mtx(args.counts_matrix)
barcodes_df = read10x.read_barcodes(args.barcodes)

print('Counts matrix shape: {} rows, {} columns'.format(*raw_counts.shape))

# Remove columns with all 0s
per_column_totals = np.sum(raw_counts, axis=0)
zero_genes = (per_column_totals == 0).A.ravel()
raw_counts = raw_counts[:, ~zero_genes]

print('Counts matrix shape after removing unexpressed genes: {} rows, {} columns'
      .format(*raw_counts.shape))

# Classifier settings come from the command line (`args`) and from `pheno` /
# `standard_scaling`, which are defined earlier in the script.
clf = doubletdetection.BoostClassifier(
    n_iters=args.n_iterations,
    use_phenograph=pheno,
    standard_scaling=standard_scaling,
    verbose=True,
)
doublets = clf.fit(raw_counts).predict(
    p_thresh=args.p_thresh,
    voter_thresh=args.voter_thresh,
)

# Put the calls next to the barcodes, then swap the numeric codes for names.
label_column = "DoubletDetection_DropletType"
results = pd.Series(doublets, name=label_column)
dataframe = pd.concat([barcodes_df, results], axis=1)
for numeric_code, droplet_type in ((1.0, "doublet"), (0.0, "singlet")):
    dataframe[label_column] = dataframe[label_column].replace(numeric_code, droplet_type)

results_path = os.path.join(args.outdir, 'DoubletDetection_results.txt')
dataframe.to_csv(results_path, sep="\t", index=False)
import os

import doubletdetection as dd
import numpy as np
import pandas as pd

# Run from this script's own directory so the relative file names below resolve.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# The matrix is transposed before fitting; presumably input.txt is stored
# genes x cells (confirm against the data).
expression = pd.read_csv("input.txt").values
clf = dd.BoostClassifier(n_iters=2, use_phenograph=False, standard_scaling=True)
labels = clf.fit(expression.T).predict(p_thresh=1e-16, voter_thresh=0.5)

scores = clf.doublet_score()
# output1.txt receives the score array's mask, output2.txt the scores themselves.
for out_name, table in (('output1.txt', scores.mask), ('output2.txt', scores)):
    pd.DataFrame(table).to_csv(out_name, index=False, header=False)

# https://github.com/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_10k_vignette.ipynb
return working_dir, sampleID, genomes = parse_arguments(sys.argv) ## Perform doublet detection for each sample sequencially for genome in genomes.split(','): if os.path.isfile(working_dir + '/count/' + sampleID + '/outs/filtered_feature_bc_matrix/matrix.mtx.gz'): matrix_path = working_dir + '/count/' + sampleID + '/outs/filtered_feature_bc_matrix/matrix.mtx.gz' # . Remove columns with all 0s raw_counts = doubletdetection.load_mtx(matrix_path) zero_genes = np.sum(raw_counts, axis=0) == 0 raw_counts = raw_counts[:, ~zero_genes] clf = doubletdetection.BoostClassifier(n_iters=50) start = time.time() doublets = clf.fit(raw_counts).predict(p_thresh=1e-7, voter_thresh=0.8) end = time.time() #print('Time elapsed: {:.2f} seconds, {:.2f}sec/iteration, for {} iterations'.format(end-start, (end-start) / clf.n_iters, clf.n_iters)) output_dir = working_dir + '/count/' + sampleID + '/outs/analysis/doubletdetection' if not os.path.isdir(output_dir): os.makedirs(output_dir) output_name_f0 = output_dir + '/' + sampleID + '_' + genome + '_doubletdetection_doublets_tmp.txt' output_doublets = open( output_dir + '/' + sampleID + '_' + genome + '_doubletdetection_doublets.txt', 'w') np.savetxt(output_name_f0, doublets, delimiter='\t')
dataset.protein_names = dataset.protein_names[non_control_proteins]

# Make anndata object
adata = anndata.AnnData(dataset.X)
adata.var.index = dataset.gene_names
adata.var_names_make_unique()
adata.obs.index = dataset.barcodes
adata.obsm["protein_expression"] = dataset.protein_expression
adata.uns["protein_names"] = dataset.protein_names

# Filter doublets called by DoubletDetection.
# The calls are cached on disk: load them if present, otherwise compute them
# once and save them for the next run.
try:
    doublets = np.load("data/metadata/pbmc5kdoublets.npy")
except FileNotFoundError:
    clf = doubletdetection.BoostClassifier(
        n_iters=25, use_phenograph=True, verbose=True, standard_scaling=False
    )
    # Comparing with 1 turns the labels into a boolean "is doublet" mask.
    doublets = clf.fit(adata.X).predict(p_thresh=1e-7, voter_thresh=0.8) == 1
    print("{} doublet rate".format(np.sum(doublets) / adata.X.shape[0]))
    np.save("data/metadata/pbmc5kdoublets.npy", doublets)

# The `np.bool` alias was removed in NumPy 1.24 and raises AttributeError there;
# the builtin `bool` is the equivalent dtype on every NumPy version.
adata = adata[~doublets.astype(bool)]

# Filter cells by min_genes
sc.pp.filter_cells(adata, min_genes=200)

# Filter cells by mitochondrial reads, n_genes, n_counts
mito_genes = adata.var_names.str.startswith("MT-")
adata.obs["percent_mito"] = np.sum(adata[:, mito_genes].X, axis=1) / np.sum(
    adata.X, axis=1
)
adata.obs["n_counts"] = adata.X.sum(axis=1)
# `sys.argv` is read below; importing `sys` here keeps this script runnable even
# if no earlier import provides it (a repeated import is harmless).
import sys

import numpy as np
import doubletdetection
import scanpy as sc
import matplotlib.pyplot as plt

# Usage: <script> <input 10x .h5 matrix> <output text file>
matrix_path = sys.argv[1]

adata = sc.read_10x_h5(matrix_path)
adata.var_names_make_unique()

# remove "empty" genes
sc.pp.filter_genes(adata, min_cells=1)

clf = doubletdetection.BoostClassifier(n_iters=50, use_phenograph=False, standard_scaling=True)
doublets = clf.fit(adata.X).predict(p_thresh=1e-7, voter_thresh=0.8)

# Select cells labelled exactly 1. DoubletDetection documents NaN as the label
# for ambiguous cells, and NaN counts as non-zero, so `doublets.nonzero()`
# would have reported those cells as doublets too.
doublet_rows = np.flatnonzero(doublets == 1)

# One 1-based cell index per line; the context manager closes the file even if
# a write fails.
with open(sys.argv[2], "w") as f:
    f.writelines(str(row + 1) + "\n" for row in doublet_rows)