def get_tenx(samples):
    tenxs = []
    for sample in samples:
        tenx = TenxDataStorage(sample)
        tenx.download()
        tenxs.append(TenxAnalysis(tenx.tenx_path))
    return tenxs
def Analysis(sampleid, before, finished):
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    analysis_path = tenx.tenx_path
    tenx_analysis = TenxAnalysis(analysis_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    cellassign_analysis = ".cache/{}/cellassignanalysis/".format(sampleid)
    if not os.path.exists(cellassign_analysis):
        os.makedirs(cellassign_analysis)
    pyfit = os.path.join(".cache/{}/cell_types.pkl".format(sampleid))
    assert os.path.exists(pyfit), "No Pyfit Found."
    pyfit = pickle.load(open(pyfit, "rb"))
    marker_list = GeneMarkerMatrix.read_yaml(config.rho_matrix)
    cell_types = marker_list.celltypes()
    if "B cell" not in cell_types:
        cell_types.append("B cell")
    celltypes(pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    tsne_by_cell_type(qc.sce, pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    umap_by_cell_type(qc.sce, pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    open(finished, "w").write("Completed")
def Run(sampleid, before, finished): tenx = TenxDataStorage(sampleid, version="v3") tenx.download() tenx_analysis = TenxAnalysis(tenx.tenx_path) tenx_analysis.load() tenx_analysis.extract() qc = QualityControl(tenx_analysis,sampleid) plots = qc.plots cellassign = os.path.join(os.path.split(plots)[0],"cellassignanalysis") results = Results(config.jobpath) results.add_analysis(tenx.tenx_path) results.add_sce(qc.qcdsce) umi = os.path.join(plots,"umi.png") mito = os.path.join(plots,"mito.png") ribo = os.path.join(plots, "ribo.png") total_counts = os.path.join(plots, "total_counts.png") tfbc = os.path.join(plots, "total_features_by_counts.png") tcvfc = os.path.join(plots, "total_counts_v_features_by_counts.png") celltypes = os.path.join(cellassign, "cell_types.png") results.add_plot(umi,"UMI Distribution") results.add_plot(mito,"Mito Distribution") results.add_plot(ribo,"Ribo Distribution") results.add_plot(total_counts,"Total Counts Distribution") results.add_plot(tcvfc,"Total Counts") results.add_plot(tcvfc,"Total Features by Counts") results.add_plot(celltypes,"Cell Types") exportMD(results) exportUpload(results) open(finished,"w").write("Completed")
def Search(sampleid):
    tenxs = []
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    analysis_path = tenx.tenx_path
    print(analysis_path)
    tenx_analysis = TenxAnalysis(analysis_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    tenxs.append(tenx_analysis.adata(qc.sce))
    print("Loading main sce {}".format(sampleid))
    sys.stdout.flush()
    samples = glob.glob("../../*/runs/.cache/*/metrics_summary.csv")
    for sample in samples:
        print("Loading project sample {}".format(sample))
        sys.stdout.flush()
        sample_rel_path = os.path.split(sample)[0]
        sid = sample_rel_path.split("/")[-1]
        sidsce = os.path.join(sample_rel_path, "{0}.rdata".format(sid))
        if not os.path.exists(sidsce):
            print("Not found", sidsce)
            continue
        tenx_analysis = TenxAnalysis(sample_rel_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        tenxs.append(tenx_analysis.adata(sidsce))
    print("Finished project tree search.")
    sys.stdout.flush()
    return tenxs
def Run(sampleid, before, finished): tenx = TenxDataStorage(sampleid, version="v3") tenx.download() analysis_path = tenx.tenx_path tenx_analysis = TenxAnalysis(analysis_path) tenx_analysis.load() tenx_analysis.extract() qc = QualityControl(tenx_analysis, sampleid) CellAssign.run(qc.sce, config.rho_matrix, ".cache/{}/celltypes.rdata".format(sampleid)) open(finished, "w").write("Completed")
def __init__(self, sampleids, chem="v2", output="./"):
    self.output = output
    self.samples = sampleids
    self.tenxs = []
    for sampleid in self.samples:
        tenx = TenxDataStorage(sampleid, version=chem)
        tenx.download()
        tenx_analysis = TenxAnalysis(tenx.tenx_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        self.tenxs.append(tenx_analysis)
def main():
    sample = "patient2"
    tenx = TenxDataStorage(sample, version="v2")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    output = "/igo_large/scratch/test_kallisto"
    fastq_directory = FastQDirectory(
        "/igo_large/scratch/allen/bams/xfastqs2/McGilvery_Sonya__TLH_MissingLibrary_1_CB8R9ANXX/",
        sample, output)
    krunner = Kallisto(fastq_directory, tenx_analysis)
    krunner.de()
def Run(sampleid, before, finished): print("Running QC.") tenx = TenxDataStorage(sampleid, version="v3") tenx.download() tenx_analysis = TenxAnalysis(tenx.tenx_path) tenx_analysis.load() tenx_analysis.extract() print("Extracted.") qc = QualityControl(tenx_analysis, sampleid) qc.run(mito=config.mito) print("Uploading") qc.upload_raw() qc.upload() open(finished, "w").write("Completed")
def Run(sampleid, before, finished, use_corrected=False):
    if use_corrected and os.path.exists(".cache/corrected/"):
        sce = ".cache/corrected/corrected_sce.rdata"
        if not os.path.exists(sce):
            utils = DropletUtils()
            utils.read10xCounts(".cache/corrected/", ".cache/corrected/corrected_sce.rdata")
    else:
        tenx = TenxDataStorage(sampleid, version="v3")
        tenx.download()
        analysis_path = tenx.tenx_path
        tenx_analysis = TenxAnalysis(analysis_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        qc = QualityControl(tenx_analysis, sampleid)
        sce = qc.sce
    if not os.path.exists(".cache/{}/celltypes.rdata".format(sampleid)):
        CellAssign.run(sce, config.rho_matrix, ".cache/{}/celltypes.rdata".format(sampleid))
    open(finished, "w").write("Completed")
def Run(sampleid, before, finished):
    clustering = ".cache/{}/clustering/".format(sampleid)
    if not os.path.exists(clustering):
        os.makedirs(clustering)
    cluster_results = os.path.join(clustering, "{}_clusters.pkl".format(sampleid))
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    analysis_path = tenx.tenx_path
    tenx_analysis = TenxAnalysis(analysis_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    if not os.path.exists(cluster_results):
        clusters = tenx_analysis.clusters(qc.sce)
        pickle.dump(clusters, open(cluster_results, "wb"))
    else:
        clusters = pickle.load(open(cluster_results, "rb"))
    tsne_by_cluster(qc.sce, clusters, sampleid, clustering)
    umap_by_cluster(qc.sce, clusters, sampleid, clustering)
    open(finished, "w").write("Completed")
def Analysis(sampleid, before, finished, use_corrected=False):
    if use_corrected and os.path.exists(".cache/corrected"):
        sce = ".cache/corrected/corrected_sce.rdata"
        if not os.path.exists(sce):
            utils = DropletUtils()
            utils.read10xCounts(".cache/corrected/", ".cache/corrected/corrected_sce.rdata")
        filtered_sce = sce
    else:
        tenx = TenxDataStorage(sampleid, version="v3")
        tenx.download()
        analysis_path = tenx.tenx_path
        tenx_analysis = TenxAnalysis(analysis_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        qc = QualityControl(tenx_analysis, sampleid)
        filtered_sce = os.path.join(os.path.split(qc.sce)[0], "sce_cas.rdata")
    cellassign_analysis = ".cache/{}/cellassignanalysis/".format(sampleid)
    if not os.path.exists(cellassign_analysis):
        os.makedirs(cellassign_analysis)
    pyfit = os.path.join(".cache/{}/cell_types.pkl".format(sampleid))
    assert os.path.exists(pyfit), "No Pyfit Found."
    pyfit = pickle.load(open(pyfit, "rb"))
    marker_list = GeneMarkerMatrix.read_yaml(config.rho_matrix)
    cell_types = marker_list.celltypes()
    if "B cell" not in cell_types:
        cell_types.append("B cell")
    celltypes(pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    tsne_by_cell_type(filtered_sce, pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    umap_by_cell_type(filtered_sce, pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    open(finished, "w").write("Completed")
def load(self):
    if not os.path.exists(self.directory):
        # Treat the missing directory as a sample id and resolve it through the cache.
        self.sample = self.directory
        if not os.path.exists(".cache/{}".format(self.sample)):
            cloud_storage = TenxDataStorage(self.sample)
            self.directory = cloud_storage.download()
        else:
            self.directory = ".cache/{}".format(self.sample)
    self.path = self.directory
    v3_path_raw = os.path.join(self.path, 'raw_feature_bc_matrix')
    v2_path_raw = os.path.join(self.path, 'raw_gene_bc_matrices')
    if os.path.exists(v3_path_raw):
        self.raw_gene_bc_matrices = v3_path_raw
        self.detected_version = "v3"
    elif os.path.exists(v2_path_raw):
        self.raw_gene_bc_matrices = v2_path_raw
        self.detected_version = "v2"
    elif os.path.exists(v3_path_raw + "_mex"):
        self.raw_gene_bc_matrices = v3_path_raw + "_mex"
        self.detected_version = "v3"
    elif os.path.exists(v2_path_raw + "_mex"):
        self.raw_gene_bc_matrices = v2_path_raw + "_mex"
        self.detected_version = "v2"
    else:
        print("No Raw Matrices folder found -- Check dir name (raw_feature_bc_matrix or raw_gene_bc_matrices)")
    v3_path_filtered = os.path.join(self.path, 'filtered_feature_bc_matrix')
    v2_path_filtered = os.path.join(self.path, 'filtered_gene_bc_matrices')
    if os.path.exists(v3_path_filtered):
        self.filtered_gene_bc_matrices = v3_path_filtered
        self.detected_version = "v3"
    elif os.path.exists(v2_path_filtered):
        self.filtered_gene_bc_matrices = v2_path_filtered
        self.detected_version = "v2"
    elif os.path.exists(v3_path_filtered + "_mex"):
        self.filtered_gene_bc_matrices = v3_path_filtered + "_mex"
        self.detected_version = "v3"
    elif os.path.exists(v2_path_filtered + "_mex"):
        self.filtered_gene_bc_matrices = v2_path_filtered + "_mex"
        self.detected_version = "v2"
    else:
        print("No Filtered Matrices folder found -- Check dir name (filtered_feature_bc_matrix or filtered_gene_bc_matrices)")
    self.clustering = os.path.join(self.path, 'analysis/clustering')
    self.matrix = os.path.join(self.path, "")
    self.projection = os.path.join(self.path, 'analysis/pca/10_components/projection.csv')
    self.cellranger_tsne = os.path.join(self.path, 'analysis/tsne/2_components/projection.csv')
    self.summary = os.path.join(self.path, "web_summary.html")
    self.metrics_summary = os.path.join(self.path, "metrics_summary.csv")
    self.top_level = "/".join(self.path.split("/")[:-3])
    self.baseobj = "sce.rdata"
    self.qcdobj = "qcdsce.rdata"
    self.rdata = os.path.join(self.directory, self.baseobj)
    self.qcdrdata = os.path.join(self.directory, self.qcdobj)
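# A minimal sketch of the v2/v3 matrix-directory fallback used in load() above,
# factored into a standalone helper for clarity; the function name and the
# (path, version) return convention are illustrative, not part of the pipeline API.
import os

def resolve_matrix_dir(base, v3_name, v2_name):
    """Return (path, detected_version) for whichever 10x matrix layout exists under base."""
    candidates = [
        (os.path.join(base, v3_name), "v3"),
        (os.path.join(base, v2_name), "v2"),
        (os.path.join(base, v3_name + "_mex"), "v3"),
        (os.path.join(base, v2_name + "_mex"), "v2"),
    ]
    for path, version in candidates:
        if os.path.exists(path):
            return path, version
    return None, None

# Example (hypothetical call):
#   raw, version = resolve_matrix_dir(self.path, "raw_feature_bc_matrix", "raw_gene_bc_matrices")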
def RunDownload(sampleids, finished):
    for i, sample in enumerate(sampleids):
        tenx = TenxDataStorage(sample)
        path = tenx.download()
        path_json = {sample: path}
        open(finished(i), "w").write(json.dumps(path_json))
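# A hedged sketch of how the per-index sentinel JSONs written by RunDownload above
# could be collected back into a single {sample: path} mapping downstream; the
# glob pattern and function name are assumptions, not part of the pipeline API.
import glob
import json

def collect_download_paths(pattern):
    paths = {}
    for sentinel in glob.glob(pattern):
        with open(sentinel) as fh:
            paths.update(json.load(fh))
    return paths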
def run_transcript(self, fastqs=None):
    fastqs = fastqs or []
    matrices = dict()
    assert len(fastqs) == len(self.samples), "Provide fastq object for each sample."
    for sampleid, fastq in zip(self.samples, fastqs):
        tenx = TenxDataStorage(sampleid, version="v2")
        tenx.download()
        tenx_analysis = TenxAnalysis(tenx.tenx_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        # chem assumed to be "v2" to match the storage version requested above
        self.krunner = Kallisto(fastq, tenx_analysis, chem="v2")
        self.krunner.run_pseudo()
        self.krunner.run_bus()
        matrix = self.krunner.design_matrix()
        matrices[sampleid] = matrix
    self.matrices = matrices
    self.matrix1 = self.matrices[self.samples[0]]
    self.matrix2 = self.matrices[self.samples[1]]
    self.common_genes = set(self.matrix1.keys()).intersection(set(self.matrix2.keys()))
    self.model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    de_file = "{}_{}_de.tsv".format(self.samples[0], self.samples[1])
    if not os.path.exists(de_file):
        output = open(de_file, "w")
        output.write("Gene\tPValue\n")
        differential_genes = dict()
        for gene in tqdm.tqdm(self.common_genes):
            tcc_common = set(self.matrix1[gene].keys()).intersection(set(self.matrix2[gene].keys()))
            if len(tcc_common) == 0:
                continue
            Y = []
            X = []
            cells1 = list(itertools.chain.from_iterable(
                [list(self.matrix1[gene][tcc].keys()) for tcc in tcc_common]))
            cells2 = list(itertools.chain.from_iterable(
                [list(self.matrix2[gene][tcc].keys()) for tcc in tcc_common]))
            if len(cells1) == 0 or len(cells2) == 0:
                continue
            for cell in cells1:
                Y.append(self.samples[0])
                predictors = []
                for tcc in tcc_common:
                    try:
                        predictors.append(self.matrix1[gene][tcc][cell])
                    except KeyError:
                        predictors.append(0)
                X.append(predictors)
            for cell in cells2:
                Y.append(self.samples[1])
                predictors = []
                for tcc in tcc_common:
                    try:
                        predictors.append(self.matrix2[gene][tcc][cell])
                    except KeyError:
                        predictors.append(0)
                X.append(predictors)
            classes = set(Y)
            Y = numpy.array(Y)
            X = numpy.array(X)
            if Y.shape[0] < 2 or len(classes) == 1:
                continue
            self.model.fit(X, Y)
            null_prob = 2.0 / float(Y.shape[0]) * numpy.ones(Y.shape)
            df = X.shape[1]
            alt_prob = self.model.predict_proba(X)
            alt_log_likelihood = -log_loss(Y, alt_prob, normalize=False)
            null_log_likelihood = -log_loss(Y, null_prob, normalize=False)
            G = 2 * (alt_log_likelihood - null_log_likelihood)
            p_value = chi2.sf(G, df)
            differential_genes[gene] = p_value
            output.write("{}\t{}\n".format(gene, p_value))
        sorted_genes = sorted(differential_genes.items(), key=operator.itemgetter(1))
        print("**************** Differential Genes ********************")
        for gene, pvalue in sorted_genes[:100]:
            print(gene, pvalue)
        output.close()
    else:
        differential_genes = dict()
        differential_genes_adj = dict()
        genes = open(de_file, "r").read().splitlines()
        genes.pop(0)
        _genes = []
        pvalues = []
        for gene in genes:
            gene, pvalue = gene.split()
            differential_genes[gene] = float(pvalue)
            pvalues.append(float(pvalue))
            _genes.append(gene)
        adj_pvalues = list(multitest.multipletests(pvalues)[1])
        print(adj_pvalues)
        for gene, pvalue, adjp in zip(_genes, pvalues, adj_pvalues):
            differential_genes_adj[gene] = adjp
        sorted_genes = sorted(differential_genes_adj.items(), key=operator.itemgetter(1))
        thresholds = (0.05, 0.01, 0.001)
        import collections
        sig_genes = collections.defaultdict(list)
        for gene, pvalue in sorted_genes:
            for threshold in thresholds:
                if pvalue < threshold:
                    sig_genes[str(threshold)].append(gene)
        print("**************** Differential Genes ********************")
        for thresh, genes_at_threshold in sig_genes.items():
            print(thresh, len(genes_at_threshold))
        for gene, pvalue in sorted_genes[:100]:
            print(gene, pvalue)
    return sorted_genes
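# A standalone sketch of the per-gene chi-square likelihood-ratio test used in
# run_transcript above, under a simplified null of equal class probabilities for
# every cell (the original uses 2/N); the function and variable names are
# illustrative, not part of the pipeline.
import numpy
from scipy.stats import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

def likelihood_ratio_test(X, Y):
    """Fit sample labels from TCC predictors and compare against a uniform null."""
    model = LogisticRegression(solver="lbfgs")
    model.fit(X, Y)
    classes = model.classes_
    # Null model: every class equally likely for every cell.
    null_prob = numpy.full((len(Y), len(classes)), 1.0 / len(classes))
    alt_ll = -log_loss(Y, model.predict_proba(X), normalize=False)
    null_ll = -log_loss(Y, null_prob, normalize=False, labels=classes)
    G = 2 * (alt_ll - null_ll)
    return chi2.sf(G, df=X.shape[1])

# Example with toy TCC counts for two samples:
#   X = numpy.array([[3, 0], [2, 1], [0, 4], [1, 5]])
#   Y = numpy.array(["sampleA", "sampleA", "sampleB", "sampleB"])
#   p_value = likelihood_ratio_test(X, Y)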