def scrublet_predictions(self, vlm, input_dir, doublet_rate=0.06):
    """Run Scrublet on a 10X matrix and attach the results to ``vlm``.

    Reads ``matrix.mtx`` and ``barcodes.tsv`` from ``input_dir``, runs the
    Scrublet pipeline on the whole counts matrix and stores two column
    attributes on ``vlm``: ``doublet_scores`` and ``doublet_predictions``.

    Args:
        vlm: Object exposing a ``ca`` mapping that contains ``'CellID'``.
            Every cell ID must equal a barcode from ``barcodes.tsv``;
            a ``KeyError`` is raised otherwise.
        input_dir: Directory containing ``matrix.mtx`` and ``barcodes.tsv``
            (with or without a trailing path separator).
        doublet_rate: Expected doublet rate handed to Scrublet.

    Returns:
        The same ``vlm`` object with the two column attributes added.
    """
    import os
    import scrublet as scr
    import scipy.io

    # os.path.join works whether or not input_dir ends with a separator; the
    # previous string concatenation dropped the '/' before barcodes.tsv.
    matrix_path = os.path.join(input_dir, 'matrix.mtx')
    barcodes_path = os.path.join(input_dir, 'barcodes.tsv')

    print('Loading counts matrix {}'.format(matrix_path), file=sys.stderr)
    # Transpose the file's matrix before handing it to Scrublet.
    counts_matrix = scipy.io.mmread(matrix_path).T.tocsc()

    print('Loading barcodes {}'.format(barcodes_path), file=sys.stderr)
    # barcodes.tsv is tab-delimited (the delimiter used to be the letter 't').
    barcodes = np.array(
        scr.load_genes(barcodes_path, delimiter='\t', column=0))

    print("Initializing scrublet object", file=sys.stderr)
    scrub = scr.Scrublet(
        counts_matrix, expected_doublet_rate=doublet_rate)  # whole counts matrix

    print("Computing doublet predictions", file=sys.stderr)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    # Map each barcode to its row so the results can be reordered to match
    # the cell order used by vlm.
    row_of_barcode = {barcode: i for i, barcode in enumerate(barcodes)}
    rows = [row_of_barcode[cell_id] for cell_id in vlm.ca['CellID']]

    # Add doublet score and doublet prediction as column attributes.
    vlm.ca["doublet_scores"] = np.array([doublet_scores[i] for i in rows])
    vlm.ca["doublet_predictions"] = np.array(
        [predicted_doublets[i] for i in rows])
    return vlm
def annotate_doublets(mtx_fpath, feature_fpath, expected_doublet_rate2=0.06,
                      *, plot=False):
    """Compute Scrublet doublet scores and calls for one counts matrix.

    Args:
        mtx_fpath: Path to the Matrix Market counts file; it is transposed
            after loading.
        feature_fpath: Path to the tab-delimited feature file; column 1 is
            read as the gene list (only used for the summary printout).
        expected_doublet_rate2: Expected doublet rate handed to Scrublet.
        plot: When true, also configure matplotlib fonts and draw the score
            histogram and a UMAP embedding. Defaults to False, which matches
            the previous behaviour where this code was disabled with
            ``if False:``.

    Returns:
        A two-element list ``[doublet_scores, predicted_doublets]``.
    """
    if plot:
        plt.rcParams['font.family'] = 'sans-serif'
        plt.rcParams['font.sans-serif'] = 'Arial'
        plt.rc('font', size=14)
        plt.rcParams['pdf.fonttype'] = 42

    counts_matrix = scipy.io.mmread(mtx_fpath).T.tocsc()
    genes = np.array(scr.load_genes(feature_fpath, delimiter='\t', column=1))
    print('Counts matrix shape: {} rows, {} columns'.format(
        counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate2)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    if plot:
        scrub.plot_histogram()
        print('Running UMAP...')
        scrub.set_embedding(
            'UMAP', scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
        print('Done.')
        scrub.plot_embedding('UMAP', order_points=True)

    return [doublet_scores, predicted_doublets]
def run_scrublet_rna(input_dir):
    """Run Scrublet on ``input_dir`` and save the results next to the input.

    Reads ``matrix.mtx`` and ``features.tsv`` from ``input_dir`` and writes
    ``predicted_doublet_mask.txt`` and ``doublet_scores.txt`` (scores with
    four decimals) into the same directory.

    Args:
        input_dir: Directory holding the input files, with or without a
            trailing path separator.
    """
    import os

    def in_dir(filename):
        # Plain string concatenation silently produced wrong paths when
        # input_dir had no trailing separator.
        return os.path.join(input_dir, filename)

    counts_matrix = scipy.io.mmread(in_dir('matrix.mtx')).T.tocsc()
    genes = np.array(
        scr.load_genes(in_dir('features.tsv'), delimiter='\t', column=1))
    print('Counts matrix shape: {} rows, {} columns'.format(
        counts_matrix.shape[0], counts_matrix.shape[1]))
    print('Number of genes in gene list: {}'.format(len(genes)))

    scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.05)
    # Results are read back from the scrub object below, so the return value
    # is not bound.
    scrub.scrub_doublets(min_counts=2,
                         min_cells=3,
                         min_gene_variability_pctl=85,
                         n_prin_comps=30)

    np.savetxt(in_dir('predicted_doublet_mask.txt'),
               scrub.predicted_doublets_, fmt='%s')
    np.savetxt(in_dir('doublet_scores.txt'),
               scrub.doublet_scores_obs_, fmt='%.4f')
def scrublet_c(sample, inDir, outDir, expected_doublet_rate, sim_doublet_ratio,
               ratio_df, out_df):
    """Run Scrublet for one sample, save diagnostic plots and record results.

    Args:
        sample: Sample label used in log lines, output file names and as the
            column key in ``ratio_df``.
        inDir: Directory containing ``matrix.mtx``.
        outDir: Directory that receives the histogram and UMAP PDFs.
        expected_doublet_rate: Expected doublet rate handed to Scrublet.
        sim_doublet_ratio: Simulated-doublet ratio handed to Scrublet.
        ratio_df: Table indexed with ``.loc``; the detected doublet rate is
            stored at ``['scrublet', sample]``.
        out_df: Table that receives the ``scrublet_doublet_scores`` and
            ``scrublet_doublets`` columns.

    Returns:
        The tuple ``(ratio_df, out_df)`` after both were updated in place.
    """
    print(sample, "start scrublet")
    counts_matrix = scipy.io.mmread(os.path.join(inDir,
                                                 'matrix.mtx')).T.tocsc()
    # The gene list is no longer loaded: it was never used, and reading it
    # made the run fail whenever genes.tsv was absent.
    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expected_doublet_rate,
                         sim_doublet_ratio=sim_doublet_ratio)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2,
        min_cells=3,
        min_gene_variability_pctl=85,
        n_prin_comps=30)

    scrub.plot_histogram()
    plt.savefig(
        os.path.join(
            outDir, "{0}_scrublet_doublet_score_histogram.pdf".format(sample)))
    # Close the saved figure so repeated per-sample calls do not pile up
    # open matplotlib figures.
    plt.close()

    print(sample, 'Running scrublet UMAP...')
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    print(sample, 'scrublet Done.')
    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(outDir, "{0}_scrublet_UMAP.pdf".format(sample)))
    plt.close()
    print(sample, "Done scrublet")

    ratio_df.loc['scrublet', sample] = scrub.detected_doublet_rate_
    out_df['scrublet_doublet_scores'] = doublet_scores
    out_df['scrublet_doublets'] = predicted_doublets
    return ratio_df, out_df
import os
import sys
import argparse


def _str2bool(value):
    """Parse a command-line boolean such as 'True' or 'False'.

    argparse's ``type=bool`` treats every non-empty string (including
    'False') as True, so the flag is parsed explicitly instead.
    """
    text = str(value).strip().lower()
    if text in ('true', 't', 'yes', 'y', '1'):
        return True
    if text in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected True or False, got {!r}'.format(value))


parser = argparse.ArgumentParser()
# The input directory is required: without it the script previously crashed
# with a TypeError while building the matrix path.
parser.add_argument('-i', '--input',
                    help='raw 10X file directory for input',
                    type=str,
                    required=True)
parser.add_argument('-o', '--output',
                    help='output directory',
                    type=str,
                    default="./")
parser.add_argument('-n', '--name',
                    help='name of output files',
                    type=str,
                    default="name")
parser.add_argument('-r', '--doublet',
                    help='expected doublet rate, default=0.06',
                    type=float,
                    default=0.06)
parser.add_argument('-e', '--embed',
                    help='plot UMAP and TSNE. True or False.',
                    type=_str2bool,
                    default=False)
args = parser.parse_args()

# Load counts matrix and barcodes. os.path.join keeps both paths valid
# whether or not the input directory ends with a separator (the barcodes
# path used to lack the '/').
matrix_path = os.path.join(args.input, 'matrix.mtx')
barcodes_path = os.path.join(args.input, 'barcodes.tsv')

print("Loading counts matrix %s" % matrix_path, file=sys.stderr)
counts_matrix = scipy.io.mmread(matrix_path).T.tocsc()

print("Loading barcodes %s" % barcodes_path, file=sys.stderr)
# barcodes.tsv is tab-delimited (the delimiter used to be the letter 't').
barcodes = np.array(scr.load_genes(barcodes_path, delimiter='\t', column=0))

# Initialize scrublet object on the whole counts matrix.
print("Initializing scrublet object", file=sys.stderr)
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=args.doublet)

print("Computing doublet predictions", file=sys.stderr)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30)

# Write scrublet output to file.
predictions_path = os.path.join(args.output,
                                args.name + "_predicted_doublets.tsv")
print("Writing doublet predictions to %s" % predictions_path, file=sys.stderr)
with open(predictions_path, 'w') as outfile:
    # NOTE(review): only the header row is written in this part of the
    # script; confirm the per-barcode rows are written elsewhere.
    outfile.write(
        "\t".join(["barcode", "doublet_score", "doublet_prediction"]) + "\n")
import scrublet as scr
import scipy.io
import matplotlib.pyplot as plt
import numpy as np
import os

# Figure defaults: Arial sans-serif at 14 pt, fonttype 42 for PDF output.
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': 'Arial',
    'font.size': 14,
    'pdf.fonttype': 42,
})

# Alternative (filtered) input kept for reference:
#input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/data/hg19/VENCHI_SampleBCITE/outs/filtered_feature_bc_matrix/'
input_dir = '/share/ScratchGeneral/briglo/scRNA/venchi/outputs/seurat/'

# Load the counts (transposed after reading) and the gene list (column 0).
counts_matrix = scipy.io.mmread(
    os.path.join(input_dir, 'combined.human.mtx')).T.tocsc()
genes = np.array(
    scr.load_genes(os.path.join(input_dir, 'genes.tsv'),
                   delimiter='\t',
                   column=0))

print('Counts matrix shape: {} rows, {} columns'.format(*counts_matrix.shape))
print('Number of genes in gene list: {}'.format(len(genes)))
# Output recorded from a previous run:
#Counts matrix shape: 12865 rows, 32738 columns
#Number of genes in gene list: 32738

# Score the cells and also request the doublet-neighbor parents.
scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30,
    get_doublet_neighbor_parents=True)
import scrublet as rc
import matplotlib.pyplot as plt
import scipy.io
from scipy.sparse import csc_matrix
import numpy as np

wd = "/restricted/projectnb/camplab/home/syyang/contamination/data/pbmc/4k/"

counts = scipy.io.mmread(wd + 'data/matrix.mtx')

# Keep rows of the raw matrix that have a count above 2 in more than two
# columns; the same mask is applied to the gene list below.
geneIndex = ((counts > 2).sum(axis=1) > 2)
row_mask = np.array(geneIndex).reshape(-1)

# NOTE: toarray() materialises the whole matrix densely before filtering.
counts_filter = counts.toarray()[row_mask, :]
print(counts_filter.shape)

# Transpose the filtered matrix and store it as CSC for Scrublet.
counts_csc = csc_matrix(counts_filter.T)

all_genes = rc.load_genes(wd + 'data/genes.tsv', delimiter='\t', column=1)
genes = np.array(all_genes)[row_mask]

scrub = rc.Scrublet(counts_csc, expected_doublet_rate=0.06)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30)

np.save("doublet_scores.npy", doublet_scores)
# Multiplying by 1 stores the calls as integers rather than booleans.
np.save("predicted_doublets.npy", predicted_doublets * 1)
print('Arguments:', len(sys.argv))
print('List:', str(sys.argv))

# All three positional arguments are needed; fail with a usage message
# instead of an IndexError when one is missing.
if len(sys.argv) < 4:
    sys.exit('usage: {} <input_dir> <outdir> <min_gene_variability_pctl>'
             .format(sys.argv[0]))

# Third argument: value passed as min_gene_variability_pctl below.
var_number = float(sys.argv[3])
print(var_number)

plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = 'Arial'
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

## Basic run with scrublet
input_dir = sys.argv[1]
# os.path.join on the directory and file name works with or without a
# trailing separator; the single-argument join used before was a no-op.
counts_matrix = scipy.io.mmread(os.path.join(input_dir,
                                             'matrix.mtx')).T.tocsc()
genes = np.array(
    scr.load_genes(os.path.join(input_dir, 'genes.tsv'),
                   delimiter='\t',
                   column=1))  # Use with the raw data

print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(counts_matrix,
                     expected_doublet_rate=0.15,
                     sim_doublet_ratio=2)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=150,
    min_gene_variability_pctl=var_number,
    n_prin_comps=30)

# Call doublets again with a fixed score threshold of 0.40.
scrub.call_doublets(threshold=0.40)

outdir = sys.argv[2]
import scipy.io import matplotlib.pyplot as plt import numpy as np import os plt.rcParams['font.family'] = 'sans-serif' plt.rcParams['font.sans-serif'] = 'Arial' plt.rc('font', size=14) plt.rcParams['pdf.fonttype'] = 42 tag = 'FFT4G_10x' output_dir = '/home/jovyan/snSeq_QCandAnalysis/scrublet' input_dir = '/home/jovyan/data/snQCandAnalysis/FFT4G_10x/filtered_feature_bc_matrix' counts_matrix = scipy.io.mmread(input_dir + '/matrix.mtx.gz').T.tocsc() genes = np.array( scr.load_genes(input_dir + '/features.tsv', delimiter='\t', column=1)) print('Counts matrix shape: {} rows, {} columns'.format( counts_matrix.shape[0], counts_matrix.shape[1])) print('Number of genes in gene list: {}'.format(len(genes))) scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06) doublet_scores, predicted_doublets = scrub.scrub_doublets( min_counts=2, min_cells=3, min_gene_variability_pctl=85, n_prin_comps=30) predicted_doublets = predicted_doublets * 1 predicted_doublets = predicted_doublets.astype(int) detected_doublets_rate = round(scrub.detected_doublet_rate_, 4) overall_doublets_rate = round(scrub.overall_doublet_rate_, 4) np.savetxt(output_dir + '/' + tag + '_' + 'doublets_scores.txt',
# A file prefix is required as the first command-line argument.
if len(sys.argv) == 1:
    print('input file prefix: python scrublet_doublet.py FEL011_S')
    # sys.exit replaces the interactive-only exit() helper; a missing
    # argument is reported with a non-zero status.
    sys.exit(1)

prefix = sys.argv[1]
print(prefix)

# Font family overrides are intentionally left disabled:
#plt.rcParams['font.family'] = 'sans-serif'
#plt.rcParams['font.sans-serif'] = 'Arial'
plt.rc('font', size=14)
plt.rcParams['pdf.fonttype'] = 42

# Inputs are named <prefix>.matrix.mtx, <prefix>.genes.tsv and
# <prefix>.barcodes.tsv.
counts_matrix = scipy.io.mmread(prefix + '.matrix.mtx').T.tocsc()
genes = np.array(
    scr.load_genes(prefix + '.genes.tsv', delimiter='\t', column=0))
cells = pd.read_table(prefix + '.barcodes.tsv', header=None)
cells.columns = ["Cell.ID"]

print('Counts matrix shape: {} rows, {} columns'.format(
    counts_matrix.shape[0], counts_matrix.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(counts_matrix, expected_doublet_rate=0.06)
def main():
    """Command-line entry point: run Scrublet and write plots and calls.

    Parses the matrix path, feature path, output directory, sample name,
    expected doublet rate and number of principal components, runs the
    Scrublet pipeline, re-calls doublets at a score threshold of 0.1, saves
    a histogram and a UMAP PNG, logs the summary rates and writes one
    predicted-doublet flag per line to
    ``<outdir>/<sampleName>.predictDoublet_scrublet.txt``.
    """
    import os

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='author:	{0} mail:	{1}'.format(__author__, __mail__))
    parser.add_argument('-m', '--mtx',
                        help='cellranger分析结果中的matrix.mtx',
                        dest='mtx',
                        required=True)
    parser.add_argument('-f', '--feature',
                        help='cellranger分析结果中的feature.csv或genes.csv',
                        dest='feature',
                        required=True)
    parser.add_argument('-o', '--outdir',
                        help='结果输出目录',
                        dest='outdir',
                        required=True)
    parser.add_argument('-s', '--sampleName',
                        help='样本名',
                        dest='sampleName',
                        required=True)
    parser.add_argument('-e', '--expectedDoubletRate',
                        help='细胞结团率',
                        dest='expectedDoubletRate',
                        type=float,
                        required=True)
    parser.add_argument('-p', '--pc',
                        help='PC值',
                        dest='pc',
                        type=int,
                        default=30)
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG,
        format=
        "%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s - %(message)s"
    )
    logging.info("开始分析")

    pc = args.pc
    expectedDoubletRate = args.expectedDoubletRate
    sampleName = args.sampleName

    # Create the output directory before the expensive computation so the
    # run cannot fail at the first savefig because the directory is missing.
    os.makedirs(args.outdir, exist_ok=True)

    # Load counts matrix and gene list. The gene list is not used further
    # in this function; loading it still checks that the file is readable.
    counts_matrix = scipy.io.mmread(args.mtx).T.tocsc()
    genes = np.array(scr.load_genes(args.feature, delimiter='\t', column=1))

    scrub = scr.Scrublet(counts_matrix,
                         expected_doublet_rate=expectedDoubletRate)

    # Run the default pipeline; results are read from the scrub object after
    # the threshold is overridden below.
    scrub.scrub_doublets(min_counts=2,
                         min_cells=3,
                         min_gene_variability_pctl=85,
                         n_prin_comps=pc)

    # Re-call doublets at a fixed threshold, then plot the score histogram
    # and a UMAP embedding.
    scrub.call_doublets(threshold=0.1)
    scrub.plot_histogram()
    plt.savefig(os.path.join(args.outdir,
                             sampleName + '_Scrublet_Histogram.png'))
    scrub.set_embedding('UMAP',
                        scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
    scrub.plot_embedding('UMAP', order_points=True)
    plt.savefig(os.path.join(args.outdir, sampleName + '_Scrublet_UMAP.png'))

    # Log the summary: expected doublet rate, detected doublet rate,
    # overall doublet rate, doublet threshold and number of PCs.
    logging.info(
        "patientID\texpected_doublet_rate\tdetected_doublet_rate\toverall_doublet_rate\tthreshold\tPC\n"
    )
    logging.info("%s\t%.4f\t%.4f\t%.4f\t%.4f\t%s\n", sampleName,
                 scrub.expected_doublet_rate, scrub.detected_doublet_rate_,
                 scrub.overall_doublet_rate_, scrub.threshold_, pc)

    # Output the doublet status of every single cell, one flag per line.
    calls_path = os.path.join(args.outdir,
                              sampleName + ".predictDoublet_scrublet.txt")
    with open(calls_path, "w") as fo:
        fo.writelines("%s\n" % (flag) for flag in scrub.predicted_doublets_)
import os
import time
import sys

input_dir = sys.argv[1] + "/"

# The raw counts matrix (E) should be a scipy sparse CSC matrix
# with cells as rows and genes as columns.
# A compressed .npz copy is written on the first run and reused afterwards.
npz_path = os.path.join(input_dir, 'gene_count.npz')
if os.path.isfile(npz_path):
    E = scipy.sparse.load_npz(npz_path)
else:
    E = scipy.io.mmread(os.path.join(input_dir, 'gene_count.mtx')).T.tocsc()
    scipy.sparse.save_npz(npz_path, E, compressed=True)

genes = np.array(
    scr.load_genes(os.path.join(input_dir, 'df_gene.tsv'),
                   delimiter='\t',
                   column=1))

print('Expression matrix shape: {} rows, {} columns'.format(
    E.shape[0], E.shape[1]))
print('Number of genes in gene list: {}'.format(len(genes)))

scrub = scr.Scrublet(E, expected_doublet_rate=0.05)
doublet_scores, predicted_doublets = scrub.scrub_doublets(
    min_counts=2,
    min_cells=3,
    min_gene_variability_pctl=85,
    n_prin_comps=30)

# Re-call doublets with a fixed score threshold before plotting.
scrub.call_doublets(threshold=0.22)
scrub.plot_histogram()
plt.savefig(os.path.join(input_dir, "hist1.png"))

print('Running UMAP...')
scrub.set_embedding('UMAP',
                    scr.get_umap(scrub.manifold_obs_, 10, min_dist=0.3))
# Draw the embedding before saving; without this call umap.png was just a
# second copy of the histogram figure.
scrub.plot_embedding('UMAP', order_points=True)
plt.savefig(os.path.join(input_dir, "umap.png"))