def umap_by_gene(rdata, gene, prefix, pcs):
    """Scatter-plot the "UMAP" embedding coloured by one gene's expression.

    The figure is written to ``figures/umap_by_<gene>.png``.

    Args:
        rdata: Input handed to ``SingleCellExperiment.fromRData``.
        gene: Column selector applied to the AnnData object; also used in the
            output file name.
        prefix: Text appended to the plot title.
        pcs: Not used by this function.

    Raises:
        AssertionError: If the number of barcodes differs from the length of
            the per-gene slice of the AnnData object.
    """
    # NOTE(review): `tenx_analysis` is not a parameter of this function, so it
    # must resolve to a module-level name -- confirm that one exists.
    tenx = TenxAnalysis(tenx_analysis)
    tenx.load()
    sce = SingleCellExperiment.fromRData(rdata)
    barcodes = sce.colData["Barcode"]
    transcripts = sce.rowData["Symbol"]
    # Fix: the original passed the undefined name `symbols` here.
    adata = tenx.create_scanpy_adata(barcodes=barcodes,
                                     transcripts=transcripts)

    gene_values = adata[:, gene]
    assert len(barcodes) == len(gene_values)
    expression = dict(zip(barcodes, gene_values))

    # The embedding is reshaped into two rows: row 0 is plotted on the x axis,
    # row 1 on the y axis.
    coords = numpy.array(sce.reducedDims["UMAP"]).reshape(2, len(barcodes))
    x_coded = dict(zip(barcodes, coords[0]))
    y_coded = dict(zip(barcodes, coords[1]))

    x = [x_coded[barcode] for barcode in barcodes]
    y = [y_coded[barcode] for barcode in barcodes]
    hue = [float(expression[barcode]) for barcode in barcodes]

    f, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=hue, alpha=0.85)
    # NOTE(review): the title says "PCA - Clusters" although the points come
    # from the "UMAP" reduced dimension and are coloured by gene expression.
    ax.set_title("PCA - Clusters - {}".format(prefix))
    ax.legend()
    plt.tight_layout()
    plt.savefig("figures/umap_by_{}.png".format(gene))
def plot_by_genes(rdata, tenx_analysis, genes, prefix, rep, pcs):
    """Save one expression-coloured embedding scatter plot per gene.

    Each plot is written to ``figures/expression/expression_<gene>.png``; the
    directory is created when missing.

    Args:
        rdata: Input handed to ``SingleCellExperiment.fromRData``.
        tenx_analysis: Argument forwarded to ``TenxAnalysis``.
        genes: Iterable of genes; one figure is produced for each.
        prefix: Not used by this function.
        rep: Name of the reduced dimension requested via
            ``sce.getReducedDims``.
        pcs: Not used by this function.
    """
    tenx = TenxAnalysis(tenx_analysis)
    tenx.load()
    sce = SingleCellExperiment.fromRData(rdata)
    dims = sce.getReducedDims(rep)
    barcodes = sce.colData["Barcode"]
    transcripts = sce.rowData["Symbol"]
    adata = tenx.create_scanpy_adata(barcodes=barcodes,
                                     transcripts=transcripts)

    # dims[0] is plotted on the x axis and dims[1] on the y axis.
    x_coded = dict(zip(barcodes, dims[0]))
    y_coded = dict(zip(barcodes, dims[1]))

    if not os.path.exists("figures/expression"):
        os.makedirs("figures/expression")

    # Coordinates are shared by every gene, so build them once.
    x = [x_coded[barcode] for barcode in barcodes]
    y = [y_coded[barcode] for barcode in barcodes]

    for gene in genes:
        expression = [float(adata[barcode, gene].X) for barcode in barcodes]
        f, ax = plt.subplots(figsize=(10, 8))
        sns.scatterplot(x=x, y=y, hue=expression, alpha=0.85)
        ax.set_title("{} Counts".format(gene))
        ax.legend()
        plt.tight_layout()
        plt.savefig("figures/expression/expression_{}.png".format(gene))
        # Fix: release the figure; otherwise one open figure accumulates per
        # gene for the lifetime of the process.
        plt.close(f)
def Run(sampleid, before, finished):
    """Collect a sample's QC and cell-type plots into a results object and export it.

    Args:
        sampleid: Sample identifier passed to ``TenxDataStorage`` and
            ``QualityControl``.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it once
            both exports have returned.
    """
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)

    plots = qc.plots
    # Cell-type figures are read from a sibling directory of the QC plots.
    cellassign = os.path.join(os.path.split(plots)[0], "cellassignanalysis")

    results = Results(config.jobpath)
    results.add_analysis(tenx.tenx_path)
    results.add_sce(qc.qcdsce)

    labelled_plots = [
        (os.path.join(plots, "umi.png"), "UMI Distribution"),
        (os.path.join(plots, "mito.png"), "Mito Distribution"),
        (os.path.join(plots, "ribo.png"), "Ribo Distribution"),
        (os.path.join(plots, "total_counts.png"),
         "Total Counts Distribution"),
        (os.path.join(plots, "total_counts_v_features_by_counts.png"),
         "Total Counts"),
        # Fix: the original registered the image above a second time under
        # this label and never used total_features_by_counts.png.
        (os.path.join(plots, "total_features_by_counts.png"),
         "Total Features by Counts"),
        (os.path.join(cellassign, "cell_types.png"), "Cell Types"),
    ]
    for path, label in labelled_plots:
        results.add_plot(path, label)

    exportMD(results)
    exportUpload(results)
    with open(finished, "w") as marker:
        marker.write("Completed")
def Analysis(sampleid, before, finished):
    """Produce the cell-type plots for a sample from its cached CellAssign fit.

    Args:
        sampleid: Sample identifier; also selects the ``.cache/<sampleid>/``
            directory that is read from and written to.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it at
            the end.

    Raises:
        AssertionError: If ``.cache/<sampleid>/cell_types.pkl`` is missing.
    """
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)

    cellassign_analysis = ".cache/{}/cellassignanalysis/".format(sampleid)
    if not os.path.exists(cellassign_analysis):
        os.makedirs(cellassign_analysis)

    pyfit_path = ".cache/{}/cell_types.pkl".format(sampleid)
    assert os.path.exists(pyfit_path), "No Pyfit Found."
    # Fix: close the pickle file deterministically.
    with open(pyfit_path, "rb") as handle:
        pyfit = pickle.load(handle)

    marker_list = GeneMarkerMatrix.read_yaml(config.rho_matrix)
    cell_types = marker_list.celltypes()
    # "B cell" is always included in the known types, even when the marker
    # matrix does not list it.
    if "B cell" not in cell_types:
        cell_types.append("B cell")

    celltypes(pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    tsne_by_cell_type(qc.sce, pyfit, sampleid, cellassign_analysis,
                      known_types=cell_types)
    umap_by_cell_type(qc.sce, pyfit, sampleid, cellassign_analysis,
                      known_types=cell_types)

    with open(finished, "w") as marker:
        marker.write("Completed")
def Search(sampleid):
    """Load the main sample plus every project sample found in the run tree.

    The main sample is loaded first; afterwards every directory matching
    ``../../*/runs/.cache/*/metrics_summary.csv`` that also contains
    ``<dirname>.rdata`` is loaded. Directories without that file are reported
    and skipped.

    Args:
        sampleid: Identifier of the main sample.

    Returns:
        list: The values returned by ``TenxAnalysis.adata`` -- main sample
        first, then the project samples in glob order.
    """
    storage = TenxDataStorage(sampleid, version="v3")
    storage.download()
    main_path = storage.tenx_path
    print(main_path)
    main_analysis = TenxAnalysis(main_path)
    main_analysis.load()
    main_analysis.extract()
    qc = QualityControl(main_analysis, sampleid)
    collected = [main_analysis.adata(qc.sce)]
    print ("Loading main sce {}".format(sampleid))
    sys.stdout.flush()

    for summary in glob.glob("../../*/runs/.cache/*/metrics_summary.csv"):
        print ("Loading project sample {}".format(summary))
        sys.stdout.flush()
        sample_dir = os.path.split(summary)[0]
        # The sample id is the last "/"-separated component of the directory.
        sid = sample_dir.rsplit("/", 1)[-1]
        sid_sce = os.path.join(sample_dir, "{0}.rdata".format(sid))
        if os.path.exists(sid_sce):
            project_analysis = TenxAnalysis(sample_dir)
            project_analysis.load()
            project_analysis.extract()
            collected.append(project_analysis.adata(sid_sce))
        else:
            print("Not found",sid_sce)

    print ("Finished project tree search.")
    sys.stdout.flush()
    return collected
def Run(sampleid, species, umi_plot, mito_plot, ribo_plot, counts_plot, raw_sce):
    """Run QC for a sample and copy its plots and SCE file to the given targets.

    Args:
        sampleid: Sample identifier passed to ``TenxDataStorage`` and
            ``QualityControl``.
        species: Not used by this function.
        umi_plot: Destination for ``umi.png``.
        mito_plot: Destination for ``mito.png``.
        ribo_plot: Destination for ``ribo.png``.
        counts_plot: Destination for ``counts.png``.
        raw_sce: Destination for the file at ``qc.sce``.
    """
    print("Running QC.")
    storage = TenxDataStorage(sampleid)
    analysis = TenxAnalysis(storage.tenx_path)
    analysis.load()
    analysis.extract()
    qc = QualityControl(analysis, sampleid)
    qc.run(mito=config.mito)

    plot_dir = qc.plots
    copies = [
        (os.path.join(plot_dir, "umi.png"), umi_plot),
        (os.path.join(plot_dir, "mito.png"), mito_plot),
        (os.path.join(plot_dir, "ribo.png"), ribo_plot),
        (os.path.join(plot_dir, "counts.png"), counts_plot),
    ]
    # This path is built but not copied anywhere.
    counts_v_features = os.path.join(plot_dir, "total_counts_v_features.png")

    results_dir = os.path.join(config.jobpath, "results")
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    for source, destination in copies:
        shutil.copyfile(source, destination)
    shutil.copyfile(qc.sce, raw_sce)
def RunExtract(sample_to_path, rdata_path):
    """Copy the QC'd SCE file of the sample described in a JSON file to ``rdata_path``.

    QC is run only when the SCE file does not already exist.

    Args:
        sample_to_path: Path to a JSON object mapping sample id to analysis
            path; the last entry of the object is the one used.
        rdata_path: Destination for the file at ``qc.sce``.

    Raises:
        IndexError: If the JSON object is empty.
    """
    # Fix: close the JSON file deterministically.
    with open(sample_to_path, "r") as handle:
        sample = json.load(handle)
    sampleid, path = list(sample.items())[-1]

    tenx_analysis = TenxAnalysis(path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    if not os.path.exists(qc.sce):
        qc.run(mito=config.mito)
    shutil.copyfile(qc.sce, rdata_path)
def scvis_by_cluster_markers(rdata, tenx_analysis, prefix, pcs, embedding_file):
    """Best-effort call of ``markers_by_clusters`` on the "SCVIS" representation.

    Any exception raised while loading the data or computing the markers is
    logged with its traceback and then suppressed.

    Args:
        rdata: Input handed to ``SingleCellExperiment.fromRData``.
        tenx_analysis: Argument forwarded to ``TenxAnalysis``.
        prefix: Not used by this function.
        pcs: Forwarded to ``markers_by_clusters``.
        embedding_file: Forwarded to ``markers_by_clusters``.

    Returns:
        None, whether or not the computation succeeded.
    """
    import logging

    try:
        tenx = TenxAnalysis(tenx_analysis)
        tenx.load()
        sce = SingleCellExperiment.fromRData(rdata)
        tenx.markers_by_clusters(sce,
                                 rep="SCVIS",
                                 pcs=pcs,
                                 embedding_file=embedding_file)
    except Exception:
        # Still best-effort (no re-raise), but the failure is no longer
        # invisible.
        logging.getLogger(__name__).exception(
            "scvis_by_cluster_markers failed; continuing without SCVIS markers")
    return None
def cluster_markers(rdata, tenx_analysis, rep, pcs, embedding_file, prefix):
    """Plot the ranked marker genes of every cluster.

    Markers are computed with ``rep="PCA"``; the ``rep`` argument is only
    forwarded to ``plot_by_markers``.

    Args:
        rdata: Input handed to ``SingleCellExperiment.fromRData`` and
            forwarded to ``plot_by_markers``.
        tenx_analysis: Argument for ``TenxAnalysis``; also forwarded.
        rep: Representation name forwarded to ``plot_by_markers``.
        pcs: Forwarded to ``markers_by_clusters`` and ``plot_by_markers``.
        embedding_file: Forwarded to ``plot_by_markers``.
        prefix: Appended to each ``"Cluster <i>"`` plot prefix.
    """
    analysis = TenxAnalysis(tenx_analysis)
    analysis.load()
    experiment = SingleCellExperiment.fromRData(rdata)
    ranking = analysis.markers_by_clusters(experiment, rep="PCA", pcs=pcs)

    # Transpose the "names" records so that each entry holds the genes of one
    # cluster.
    per_cluster = list(zip(*ranking["rank_genes_groups"]["names"]))
    for index, cluster_genes in enumerate(per_cluster):
        plot_by_markers(rdata, tenx_analysis, cluster_genes,
                        "Cluster {} {}".format(index, prefix), rep, pcs,
                        embedding_file)
def Run(sampleid, before, finished):
    """Run CellAssign on a sample's QC'd SCE and record completion.

    The output path handed to ``CellAssign.run`` is
    ``.cache/<sampleid>/celltypes.rdata``.

    Args:
        sampleid: Sample identifier passed to ``TenxDataStorage`` and
            ``QualityControl``.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it at
            the end.
    """
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)

    celltypes_rdata = ".cache/{}/celltypes.rdata".format(sampleid)
    CellAssign.run(qc.sce, config.rho_matrix, celltypes_rdata)

    # Fix: close the marker file deterministically.
    with open(finished, "w") as marker:
        marker.write("Completed")
def __init__(self, sampleids, chem="v2", output="./"):
    """Download, load and extract the 10x analysis of every sample.

    Args:
        sampleids: Iterable of sample identifiers; stored as ``self.samples``.
        chem: Version string passed to ``TenxDataStorage`` for each sample.
        output: Stored as ``self.output``.
    """
    self.output = output
    self.samples = sampleids
    # Keep the chemistry on the instance so that later steps can use the
    # same value instead of it being lost after construction.
    self.chem = chem
    self.tenxs = []
    for sampleid in self.samples:
        tenx = TenxDataStorage(sampleid, version=chem)
        tenx.download()
        tenx_analysis = TenxAnalysis(tenx.tenx_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        self.tenxs.append(tenx_analysis)
def Run(sampleid, before, finished):
    """Run QC for a sample, upload the results and record completion.

    Args:
        sampleid: Sample identifier passed to ``TenxDataStorage`` and
            ``QualityControl``.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it
            after both uploads have returned.
    """
    print("Running QC.")
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    print("Extracted.")

    qc = QualityControl(tenx_analysis, sampleid)
    qc.run(mito=config.mito)

    print("Uploading")
    qc.upload_raw()
    qc.upload()

    # Fix: close the marker file deterministically.
    with open(finished, "w") as marker:
        marker.write("Completed")
def main():
    """Run the Kallisto ``de`` step for the hard-coded sample ``patient2``.

    The sample name, the FASTQ directory and the output directory are all
    fixed constants inside this function.
    """
    sample = "patient2"
    output = "/igo_large/scratch/test_kallisto"
    fastq_source = (
        "/igo_large/scratch/allen/bams/xfastqs2/"
        "McGilvery_Sonya__TLH_MissingLibrary_1_CB8R9ANXX/"
    )

    storage = TenxDataStorage(sample, version="v2")
    storage.download()
    analysis = TenxAnalysis(storage.tenx_path)
    analysis.load()

    fastq_directory = FastQDirectory(fastq_source, sample, output)
    runner = Kallisto(fastq_directory, analysis)
    runner.de()
def cell_type_by_cluster(rdata, cell_assign_fit, tenx_analysis, prefix):
    """Bar-plot, per cluster, the fraction of its cells assigned to each cell type.

    Barcodes present in the CellAssign fit but absent from the clustering are
    ignored. The figure is saved to ``figures/cell_type_by_cluster.png``.

    Args:
        rdata: Input handed to ``SingleCellExperiment.fromRData``.
        cell_assign_fit: Path to a pickle providing ``"Barcode"`` and
            ``"cell_type"`` entries.
        tenx_analysis: Argument forwarded to ``TenxAnalysis``.
        prefix: Text appended to the plot title.
    """
    tenx = TenxAnalysis(tenx_analysis)
    tenx.load()
    # Fix: close the pickle file deterministically.
    with open(cell_assign_fit, "rb") as handle:
        fit = pickle.load(handle)
    cell_types = dict(zip(fit["Barcode"], fit["cell_type"]))

    sce = SingleCellExperiment.fromRData(rdata)
    cluster_labels = tenx.clusters(sce)
    clusters = dict(zip(sce.colData["Barcode"], cluster_labels))

    # counts[cluster][cell type] -> number of cells; observed_types keeps the
    # distinct cell types in first-seen order.
    counts = collections.defaultdict(lambda: collections.defaultdict(int))
    observed_types = []
    seen_types = set()
    for barcode, cell in cell_types.items():
        if barcode not in clusters:
            continue
        counts[str(clusters[barcode])][cell] += 1
        if cell not in seen_types:
            seen_types.add(cell)
            observed_types.append(cell)

    fclusters = []
    fcelltypes = []
    fpercentages = []
    for cluster, type_counts in counts.items():
        total = float(sum(type_counts.values()))
        # Fix: emit one row per (cluster, distinct cell type). The original
        # looped over the per-cell list, producing one identical row per
        # cell and a frame of size clusters x cells.
        for cell in observed_types:
            fclusters.append(cluster)
            fcelltypes.append(cell)
            fpercentages.append(type_counts.get(cell, 0) / total)

    df = pandas.DataFrame({
        "Cluster": fclusters,
        "Cell Type": fcelltypes,
        "Percentage": fpercentages
    })

    f, ax = plt.subplots(figsize=(16, 8))
    ax = sns.barplot(x="Cluster",
                     y="Percentage",
                     hue="Cell Type",
                     data=df,
                     palette="tab10",
                     ax=ax)
    ax.set_title("Cell Type by Cluster - {}".format(prefix))
    plt.tight_layout()
    plt.savefig("figures/cell_type_by_cluster.png")
def Run(sampleid, before, finished, use_corrected=False):
    """Run CellAssign for a sample unless its output already exists.

    Args:
        sampleid: Sample identifier; also selects the ``.cache/<sampleid>/``
            output directory.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it at
            the end.
        use_corrected: When true and ``.cache/corrected/`` exists, use
            ``.cache/corrected/corrected_sce.rdata`` (built with
            ``DropletUtils.read10xCounts`` when missing) instead of the
            sample's own QC'd SCE.
    """
    if use_corrected and os.path.exists(".cache/corrected/"):
        sce = ".cache/corrected/corrected_sce.rdata"
        if not os.path.exists(sce):
            utils = DropletUtils()
            utils.read10xCounts(".cache/corrected/", sce)
    else:
        tenx = TenxDataStorage(sampleid, version="v3")
        tenx.download()
        tenx_analysis = TenxAnalysis(tenx.tenx_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        qc = QualityControl(tenx_analysis, sampleid)
        sce = qc.sce

    celltypes_rdata = ".cache/{}/celltypes.rdata".format(sampleid)
    if not os.path.exists(celltypes_rdata):
        CellAssign.run(sce, config.rho_matrix, celltypes_rdata)

    # Fix: close the marker file deterministically.
    with open(finished, "w") as marker:
        marker.write("Completed")
def Run(sampleid, before, finished):
    """Cluster a sample (or reuse cached clusters) and draw the cluster plots.

    Cluster labels are cached in
    ``.cache/<sampleid>/clustering/<sampleid>_clusters.pkl``.

    Args:
        sampleid: Sample identifier.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it at
            the end.
    """
    clustering = ".cache/{}/clustering/".format(sampleid)
    if not os.path.exists(clustering):
        os.makedirs(clustering)
    cluster_results = os.path.join(clustering,
                                   "{}_clusters.pkl".format(sampleid))

    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)

    # Fix: every file below is opened in a `with` block so the handle is
    # flushed and closed even if pickling fails.
    if not os.path.exists(cluster_results):
        clusters = tenx_analysis.clusters(qc.sce)
        with open(cluster_results, "wb") as handle:
            pickle.dump(clusters, handle)
    else:
        with open(cluster_results, "rb") as handle:
            clusters = pickle.load(handle)

    tsne_by_cluster(qc.sce, clusters, sampleid, clustering)
    umap_by_cluster(qc.sce, clusters, sampleid, clustering)

    with open(finished, "w") as marker:
        marker.write("Completed")
def Analysis(sampleid, before, finished, use_corrected=False):
    """Produce the cell-type plots for a sample from its cached CellAssign fit.

    Args:
        sampleid: Sample identifier; also selects the ``.cache/<sampleid>/``
            directory that is read from and written to.
        before: Not used by this function.
        finished: Path of a marker file; ``"Completed"`` is written to it at
            the end.
        use_corrected: When true and ``.cache/corrected`` exists, plot from
            ``.cache/corrected/corrected_sce.rdata`` (built with
            ``DropletUtils.read10xCounts`` when missing); otherwise plot from
            ``sce_cas.rdata`` next to the sample's QC'd SCE.

    Raises:
        AssertionError: If ``.cache/<sampleid>/cell_types.pkl`` is missing.
    """
    if use_corrected and os.path.exists(".cache/corrected"):
        sce = ".cache/corrected/corrected_sce.rdata"
        if not os.path.exists(sce):
            utils = DropletUtils()
            utils.read10xCounts(".cache/corrected/", sce)
        # Bound at branch level so it is defined whether or not the corrected
        # SCE had to be built.
        filtered_sce = sce
    else:
        tenx = TenxDataStorage(sampleid, version="v3")
        tenx.download()
        tenx_analysis = TenxAnalysis(tenx.tenx_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        qc = QualityControl(tenx_analysis, sampleid)
        filtered_sce = os.path.join(os.path.split(qc.sce)[0], "sce_cas.rdata")

    cellassign_analysis = ".cache/{}/cellassignanalysis/".format(sampleid)
    if not os.path.exists(cellassign_analysis):
        os.makedirs(cellassign_analysis)

    pyfit_path = ".cache/{}/cell_types.pkl".format(sampleid)
    assert os.path.exists(pyfit_path), "No Pyfit Found."
    # Fix: close the pickle file deterministically.
    with open(pyfit_path, "rb") as handle:
        pyfit = pickle.load(handle)

    marker_list = GeneMarkerMatrix.read_yaml(config.rho_matrix)
    cell_types = marker_list.celltypes()
    # "B cell" is always included in the known types, even when the marker
    # matrix does not list it.
    if "B cell" not in cell_types:
        cell_types.append("B cell")

    celltypes(pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    tsne_by_cell_type(filtered_sce, pyfit, sampleid, cellassign_analysis,
                      known_types=cell_types)
    umap_by_cell_type(filtered_sce, pyfit, sampleid, cellassign_analysis,
                      known_types=cell_types)

    with open(finished, "w") as marker:
        marker.write("Completed")
def run_transcript(self, fastqs=None):
    """Run Kallisto per sample and test genes for differential expression.

    For every sample a design matrix is built with Kallisto. The first two
    samples are then compared gene by gene: a logistic regression on the
    shared TCC predictors is scored against a null model with a
    likelihood-ratio statistic, and the resulting p-values are written to
    ``<sample0>_<sample1>_de.tsv``. When that file already exists, no model
    is fitted; the stored p-values are read back and adjusted with
    ``multitest.multipletests`` instead.

    Args:
        fastqs: One fastq object per entry of ``self.samples``, in the same
            order. Defaults to an empty list.

    Returns:
        list: ``(gene, p-value)`` tuples sorted by ascending p-value -- raw
        p-values when they were computed in this call, adjusted p-values
        when they were loaded from the existing file.

    Raises:
        AssertionError: If ``fastqs`` and ``self.samples`` differ in length.
    """
    import collections

    fastqs = [] if fastqs is None else fastqs
    assert len(fastqs) == len(
        self.samples), "Provide fastq object for each sample."

    # Fix: `chem` was an undefined name here. Use the instance's chemistry
    # when it has one, else "v2" (the version hard-coded for the download
    # below).
    chem = getattr(self, "chem", "v2")

    matrices = dict()
    # Fix: iterate the `fastqs` argument (the original read an unset
    # `self.fastqs`) and give Kallisto the per-sample fastq rather than the
    # whole list.
    for sampleid, fastq in zip(self.samples, fastqs):
        tenx = TenxDataStorage(sampleid, version="v2")
        tenx.download()
        tenx_analysis = TenxAnalysis(tenx.tenx_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        self.krunner = Kallisto(fastq, tenx_analysis, chem=chem)
        self.krunner.run_pseudo()
        self.krunner.run_bus()
        matrices[sampleid] = self.krunner.design_matrix()

    self.matrices = matrices
    # Fix: `sampleids` was undefined in this scope; the ids are kept in
    # `self.samples`.
    first, second = self.samples[0], self.samples[1]
    self.matrix1 = self.matrices[first]
    self.matrix2 = self.matrices[second]
    self.common_genes = set(self.matrix1.keys()).intersection(
        set(self.matrix2.keys()))
    self.model = LogisticRegression(random_state=0,
                                    solver='lbfgs',
                                    multi_class='multinomial')
    de_file = "{}_{}_de.tsv".format(first, second)

    def predictor_rows(gene_matrix, cells, tccs):
        # One row per cell: its value for every shared TCC, 0 when absent.
        rows = []
        for cell in cells:
            predictors = []
            for tcc in tccs:
                try:
                    predictors.append(gene_matrix[tcc][cell])
                except KeyError:
                    predictors.append(0)
            rows.append(predictors)
        return rows

    if not os.path.exists(de_file):
        # Fix: a stray `return` at the top of this branch made all of the
        # fitting code unreachable, so the TSV could never be produced.
        # NOTE(review): confirm the early return was not a deliberate
        # switch-off of this (expensive) step.
        differential_genes = dict()
        with open(de_file, "w") as output:
            output.write("Gene\tPValue\n")
            for gene in tqdm.tqdm(self.common_genes):
                gene1 = self.matrix1[gene]
                gene2 = self.matrix2[gene]
                tccs = list(set(gene1.keys()).intersection(set(gene2.keys())))
                if len(tccs) == 0:
                    continue
                cells1 = list(
                    itertools.chain.from_iterable(
                        [list(gene1[tcc].keys()) for tcc in tccs]))
                cells2 = list(
                    itertools.chain.from_iterable(
                        [list(gene2[tcc].keys()) for tcc in tccs]))
                if len(cells1) == 0 or len(cells2) == 0:
                    continue

                labels = [first] * len(cells1) + [second] * len(cells2)
                X = numpy.array(
                    predictor_rows(gene1, cells1, tccs) +
                    predictor_rows(gene2, cells2, tccs))
                Y = numpy.array(labels)
                if Y.shape[0] < 2 or len(set(labels)) == 1:
                    continue

                self.model.fit(X, Y)
                # NOTE(review): the null model assigns 2/N to every row;
                # confirm this is the intended null probability.
                null_prob = 2.0 / float(Y.shape[0]) * numpy.ones(Y.shape)
                df = X.shape[1]
                alt_prob = self.model.predict_proba(X)
                alt_log_likelihood = -log_loss(Y, alt_prob, normalize=False)
                null_log_likelihood = -log_loss(Y, null_prob, normalize=False)
                # Likelihood-ratio statistic, scored against chi-square with
                # one degree of freedom per predictor column.
                G = 2 * (alt_log_likelihood - null_log_likelihood)
                p_value = chi2.sf(G, df)
                differential_genes[gene] = p_value
                output.write("{}\t{}\n".format(gene, p_value))

        sorted_genes = sorted(differential_genes.items(),
                              key=operator.itemgetter(1))
        print("**************** Differential Genes ********************")
        for gene, pvalue in sorted_genes[:100]:
            print(gene, pvalue)
    else:
        with open(de_file, "r") as handle:
            lines = handle.read().splitlines()
        lines.pop(0)  # drop the "Gene\tPValue" header

        gene_names = []
        pvalues = []
        for line in lines:
            gene, pvalue = line.split()
            gene_names.append(gene)
            pvalues.append(float(pvalue))

        # An existing file holding only the header yields no p-values to
        # adjust.
        if pvalues:
            adj_pvalues = list(multitest.multipletests(pvalues)[1])
        else:
            adj_pvalues = []
        print(adj_pvalues)

        differential_genes_adj = dict(zip(gene_names, adj_pvalues))
        sorted_genes = sorted(differential_genes_adj.items(),
                              key=operator.itemgetter(1))

        thresholds = (0.05, 0.01, 0.001)
        sig_genes = collections.defaultdict(list)
        for gene, pvalue in sorted_genes:
            for threshold in thresholds:
                if pvalue < threshold:
                    sig_genes[str(threshold)].append(gene)

        print("**************** Differential Genes ********************")
        for thresh, genes_below in sig_genes.items():
            print(thresh, len(genes_below))
        for gene, pvalue in sorted_genes[:100]:
            print(gene, pvalue)
    return sorted_genes
def umap_by_cluster_markers(rdata, tenx_analysis, prefix, pcs):
    """Compute cluster markers on the "UMAP" representation and print their keys.

    Args:
        rdata: Input handed to ``SingleCellExperiment.fromRData``.
        tenx_analysis: Argument forwarded to ``TenxAnalysis``.
        prefix: Not used by this function.
        pcs: Forwarded to ``markers_by_clusters``.
    """
    analysis = TenxAnalysis(tenx_analysis)
    analysis.load()
    experiment = SingleCellExperiment.fromRData(rdata)
    marker_table = analysis.markers_by_clusters(experiment,
                                                rep="UMAP",
                                                pcs=pcs)
    marker_keys = marker_table.keys()
    print(marker_keys)