def test_raw_assay_type_equivelence(self):
    """Check that the ``counts`` assay has the same type for both loaders.

    One experiment is loaded from the example RData fixture; the other is
    built from the result of ``DropletUtils.read10xCounts``.
    """
    rdata = os.path.join(base_dir, "tests/example_sce.RData")
    sce_from_rdata = SingleCellExperiment.fromRData(rdata)
    tenx = DropletUtils()
    # NOTE(review): machine-specific absolute path; the test can only run
    # where this directory exists.
    rs4_results = tenx.read10xCounts("/home/ceglian/data/raw_gene_bc_matrices/hg19/")
    # NOTE(review): other call sites build from RS4 results with
    # SingleCellExperiment.fromRS4(...); confirm the bare constructor is
    # intended here.
    sce_from_rs4 = SingleCellExperiment(rs4_results)
    # The previous assertion compared the RData assay type with itself and
    # therefore could never fail; compare against the RS4-derived object.
    self.assertEqual(
        type(sce_from_rdata.assays["counts"]),
        type(sce_from_rs4.assays["counts"]),
    )
def test_save_and_load_rdata(self):
    """Read 10x counts, save them as RData, reload and print the counts shape.

    The test makes no assertions; it only fails if one of the steps raises.
    """
    print("Reading")
    droplet_utils = DropletUtils()
    raw_counts = droplet_utils.read10xCounts("/home/ceglian/data/raw_gene_bc_matrices/hg19/")
    experiment = SingleCellExperiment.fromRS4(raw_counts)

    print("Writing")
    experiment.save("tests/sce_1.rdata")

    print("Loading...")
    reloaded = SingleCellExperiment.fromRData("tests/sce_1.rdata")
    reloaded_counts = reloaded.assays["counts"]
    print(reloaded_counts.shape)
def test_symbol_retrieve(self):
    """Print the rowData keys of experiments obtained from four sources.

    Sources, in order: a ``TenxAnalysis`` directory, the example SCE
    fixture, a ``DropletUtils`` read of ``tests/hg19/`` and the copy-number
    fixture. For the last one the row and column names are printed too.
    No assertions are made.
    """
    # 1. Straight from a TenxAnalysis directory.
    analysis = TenxAnalysis("tests/pre_igo")
    from_analysis = TenX.read10xCounts(analysis)
    print(from_analysis.rowData.keys())

    # 2. Example SCE fixture stored as RData.
    sce_fixture = os.path.join(base_dir, "tests/example_sce.rda")
    from_fixture = SingleCellExperiment.fromRData(sce_fixture)
    print(from_fixture.rowData.keys())

    # 3. RS4 object produced by DropletUtils.
    droplet_utils = DropletUtils()
    rs4_counts = droplet_utils.read10xCounts("tests/hg19/")
    from_rs4 = SingleCellExperiment.fromRS4(rs4_counts)
    print(from_rs4.rowData.keys())

    # 4. Copy-number fixture stored as RData.
    copy_number_fixture = os.path.join(base_dir, "tests/example_copy_number.rda")
    from_copy_number = SingleCellExperiment.fromRData(copy_number_fixture)
    print(from_copy_number.rowData.keys())
    print(from_copy_number.rownames)
    print(from_copy_number.colnames)
def scvis_by_cluster(rdata, tenx, prefix, pcs, embedding_file):
    """Scatter-plot an SCVIS embedding coloured by cluster label.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx: Object whose ``clusters(sce, pcs=...)`` result is indexed by
            barcode.
        prefix: Output directory for the figure; also used in the title.
        pcs: Forwarded to ``tenx.clusters``.
        embedding_file: Tab-separated file with one header line; the first
            column of every row is skipped and the rest parsed as floats.

    Raises:
        KeyError: If a barcode has no entry in the cluster labels.
    """
    sce = SingleCellExperiment.fromRData(rdata)
    cluster_labels = tenx.clusters(sce, pcs=pcs)

    # Close the embedding file deterministically instead of leaking the handle.
    with open(embedding_file, "r") as handle:
        rows = handle.read().splitlines()
    # rows[0] is the header; column 0 of each data row is not a coordinate.
    dims = [list(map(float, row.split("\t")[1:])) for row in rows[1:]]

    barcodes = sce.colData["Barcode"]
    print(dims)

    x = []
    y = []
    clusters = []
    # NOTE(review): rows are paired with barcodes purely by position —
    # confirm the embedding file is written in colData order.
    for barcode, dim in zip(barcodes, dims):
        x.append(dim[0])
        y.append(dim[1])
        clusters.append("Cluster {}".format(cluster_labels[barcode]))

    _, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=clusters, alpha=0.85)
    ax.set_title("SCVIS - Clusters - {}".format(prefix))
    ax.legend()
    plt.tight_layout()
    # The previous template "{}}/..." contained a stray "}" and made
    # str.format raise ValueError before anything was saved.
    plt.savefig("{}/svis_by_cluster.png".format(prefix))
def test_call_empty_drops(self):
    """Run ``TenX.emptyDrops`` on the example counts assay and print its keys."""
    fixture_path = os.path.join(base_dir, "tests/example_sce.RData")
    experiment = SingleCellExperiment.fromRData(fixture_path)
    tenx = TenX()
    counts = experiment.assays["counts"]
    result = tenx.emptyDrops(counts)
    print(result.keys())
def pca_by_cluster(rdata, tenx, prefix, pcs):
    """Scatter-plot the first two PCA dimensions coloured by cluster.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx: Object whose ``clusters(sce, pcs=...)`` returns a mapping of
            barcode to cluster label.
        prefix: Output directory for ``pca_by_cluster.png``; also used in
            the title.
        pcs: Forwarded to ``tenx.clusters`` and to ``getReducedDims``.

    Barcodes that have a cluster label but no PCA coordinate are skipped.
    """
    sce = SingleCellExperiment.fromRData(rdata)
    cluster_labels = tenx.clusters(sce, pcs=pcs)
    pca_dims = sce.getReducedDims("PCA", n=pcs)
    barcodes = sce.colData["Barcode"]
    x_coded = dict(zip(barcodes, pca_dims[0]))
    y_coded = dict(zip(barcodes, pca_dims[1]))

    x = []
    y = []
    clusters = []
    for barcode, cluster in cluster_labels.items():
        # Only a missing barcode is an expected condition; anything else
        # should surface instead of being swallowed by a blanket except.
        try:
            x_val = x_coded[barcode]
            y_val = y_coded[barcode]
        except KeyError:
            continue
        x.append(x_val)
        y.append(y_val)
        clusters.append("Cluster {}".format(cluster))

    _, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=clusters, alpha=0.85)
    ax.set_title("PCA - Clusters - {}".format(prefix))
    ax.legend()
    plt.tight_layout()
    plt.savefig("{}/pca_by_cluster.png".format(prefix))
def tsne_by_cluster(rdata, tenx, prefix, pcs):
    """Plot the TSNE embedding by cluster and dump it as JSON.

    Writes three files into the ``prefix`` directory:
    ``tsne_embedding.json`` (barcode -> [x, y]), ``tsne_clusters.json``
    (barcode -> cluster) and ``tsne_by_cluster.png``.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx: Object whose ``clusters(sce, pcs=...)`` returns a mapping of
            barcode to cluster label.
        prefix: Output directory; also used in the plot title.
        pcs: Forwarded to ``tenx.clusters``.

    Raises:
        KeyError: If a clustered barcode has no TSNE coordinate.
    """
    sce = SingleCellExperiment.fromRData(rdata)
    cluster_labels = tenx.clusters(sce, pcs=pcs)
    tsne_dims = sce.reducedDims["TSNE"]
    barcodes = sce.colData["Barcode"]
    # NOTE(review): assumes the stored values are laid out as all x values
    # followed by all y values — confirm against the writer.
    tsne_dims = numpy.array(tsne_dims).reshape(2, len(barcodes))
    x_coded = dict(zip(barcodes, tsne_dims[0]))
    y_coded = dict(zip(barcodes, tsne_dims[1]))

    x = []
    y = []
    clusters = []
    embedding = dict()
    labels = dict()
    for barcode, cluster in cluster_labels.items():
        clusters.append("Cluster {}".format(cluster))
        x.append(x_coded[barcode])
        y.append(y_coded[barcode])
        embedding[barcode] = (x_coded[barcode], y_coded[barcode])
        labels[barcode] = cluster

    # Serialise before opening so a failed dump never leaves an empty file,
    # and use context managers so handles are closed even if a write fails.
    embedding_str = json.dumps(embedding)
    with open("{}/tsne_embedding.json".format(prefix), "w") as output:
        output.write(embedding_str)
    clusters_str = json.dumps(labels)
    with open("{}/tsne_clusters.json".format(prefix), "w") as output:
        output.write(clusters_str)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=clusters, alpha=0.85)
    ax.set_title("TSNE - Clusters - {}".format(prefix))
    ax.legend()
    plt.tight_layout()
    plt.savefig("{}/tsne_by_cluster.png".format(prefix))
def adata(self, scepath, subset=None):
    """Load an RData file and convert it with ``create_scanpy_adata``.

    Args:
        scepath: Path to the RData file; ``self.rdata`` is used when None.
        subset: Forwarded unchanged to ``self.create_scanpy_adata``.

    Returns:
        Whatever ``self.create_scanpy_adata`` returns for the loaded
        experiment.
    """
    source = self.rdata if scepath is None else scepath
    source = os.path.abspath(source)
    print(source)
    experiment = SingleCellExperiment.fromRData(source)
    return self.create_scanpy_adata(experiment, subset=subset)
def test_clone_align(self):
    """Run ``CloneAlign.run`` on the example SCE fixture.

    The test makes no assertions; it only fails if loading or running raises.
    """
    example_rda = os.path.join(base_dir, "tests/example_sce.rda")
    # The fit fixture lives under base_dir. It was previously joined onto
    # the .rda *file* path, which can never name a real file.
    # NOTE(review): this path is still unused — compare `res` against it or
    # drop it.
    example_clonealign_fit = os.path.join(
        base_dir, "tests/example_clonealign_fit.rda")
    sce = SingleCellExperiment.fromRData(example_rda)
    clonealigner = CloneAlign()
    res = clonealigner.run(sce)
def pca_by_cell_type(rdata, cell_assign_fit, prefix):
    """Scatter-plot the first two PCA dimensions coloured by cell type.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        cell_assign_fit: Path to a pickle holding ``"Barcode"`` and
            ``"cell_type"`` sequences.
        prefix: Used in the plot title only; the figure is always written
            to ``figures/pca_by_celltype.png``.

    Barcodes from the fit that have no PCA coordinate are skipped.
    """
    sce = SingleCellExperiment.fromRData(rdata)
    # NOTE: pickle can execute arbitrary code on load; only use trusted fits.
    with open(cell_assign_fit, "rb") as handle:
        fit = pickle.load(handle)
    pca_dims = sce.getReducedDims("PCA")
    barcodes = sce.colData["Barcode"]
    # Only the first len(barcodes) entries of the fit are considered.
    cell_types = dict(
        zip(fit["Barcode"][:len(barcodes)], fit["cell_type"][:len(barcodes)]))
    x_coded = dict(zip(barcodes, pca_dims[0]))
    y_coded = dict(zip(barcodes, pca_dims[1]))

    x = []
    y = []
    clusters = []
    for barcode, cell_type in cell_types.items():
        try:
            x_val = x_coded[barcode]
            y_val = y_coded[barcode]
        except KeyError:
            continue
        # The barcode comes from cell_types itself, so the former "Other"
        # fallback for a failed lookup could never trigger.
        clusters.append(cell_type)
        x.append(x_val)
        y.append(y_val)

    _, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=clusters, alpha=0.85)
    ax.set_title("PCA - Cell Type - {}".format(prefix))
    ax.legend()
    plt.tight_layout()
    plt.savefig("figures/pca_by_celltype.png")
def umap_by_gene(rdata, gene, prefix, pcs):
    """Scatter-plot the UMAP embedding coloured by one gene's expression.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        gene: Gene used to index the AnnData object; also part of the
            output file name ``figures/umap_by_<gene>.png``.
        prefix: Used in the plot title.
        pcs: Unused.
    """
    # NOTE(review): `tenx_analysis` is not a parameter of this function, so
    # it must exist at module level or this line raises NameError. Sibling
    # plotting functions take it as an argument.
    tenx = TenxAnalysis(tenx_analysis)
    tenx.load()
    sce = SingleCellExperiment.fromRData(rdata)
    umap_dims = sce.reducedDims["UMAP"]
    barcodes = sce.colData["Barcode"]
    transcripts = sce.rowData["Symbol"]
    # Previously passed the undefined name `symbols`; the row symbols read
    # just above are what is meant.
    adata = tenx.create_scanpy_adata(barcodes=barcodes, transcripts=transcripts)
    assert len(barcodes) == len(adata[:, gene])
    expression = dict(zip(barcodes, adata[:, gene]))
    # NOTE(review): assumes the stored values are laid out as all x values
    # followed by all y values — confirm against the writer.
    umap_dims = numpy.array(umap_dims).reshape(2, len(barcodes))
    x_coded = dict(zip(barcodes, umap_dims[0]))
    y_coded = dict(zip(barcodes, umap_dims[1]))

    x = []
    y = []
    levels = []
    for barcode in barcodes:
        levels.append(float(expression[barcode]))
        x.append(x_coded[barcode])
        y.append(y_coded[barcode])

    _, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=levels, alpha=0.85)
    # The title used to read "PCA - Clusters", copied from the PCA plot.
    ax.set_title("UMAP - {} - {}".format(gene, prefix))
    ax.legend()
    plt.tight_layout()
    plt.savefig("figures/umap_by_{}.png".format(gene))
def run(tenx, rdata, copy_number_data, clone_assignments, assay="counts", run_cmd=True):
    """Run clonealign, through the command-line wrapper or through rpy2.

    Args:
        tenx: Object providing ``get_genes``, ``gene_map`` and
            ``create_scanpy_adata`` for the loaded experiment.
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        copy_number_data: Copy-number input. It is forwarded to
            ``CloneAlign.run_command_line`` or, when ``run_cmd`` is False,
            to the R ``clonealign`` function.
        clone_assignments: Forwarded to ``CloneAlign.run_command_line``.
        assay: Assay name that must be present in the experiment.
        run_cmd: When True, use the command-line wrapper and skip the run
            if ``rdata/clone_align.rdata`` already exists.

    Raises:
        AssertionError: If ``assay`` is not one of the experiment's assays.
        ValueError: If the command-line run leaves no
            ``rdata/clone_align.rdata`` behind.
    """
    fit_path = "rdata/clone_align.rdata"
    sce = SingleCellExperiment.fromRData(rdata)
    # Validate before the expensive conversions rather than after them.
    assert assay in sce.assayNames, "Assay not present in SCE."

    _genes = tenx.get_genes(sce)
    convert = tenx.gene_map(sce)
    # Translate gene ids where a mapping exists; keep the original otherwise.
    genes = [convert[gene] if gene in convert else gene for gene in _genes]
    adata = tenx.create_scanpy_adata(sce)
    matrix = adata.X.T

    if run_cmd:
        print("Calling CMD Clone Align.")
        if not os.path.exists(fit_path):
            CloneAlign.run_command_line(adata, copy_number_data, clone_assignments, genes)
        if not os.path.exists(fit_path):
            raise ValueError("Rscript 'run_clonealign.r' Failed.")
        # NOTE(review): this reads the *cell assign* fit although the file
        # checked above is the clone align fit — looks like a copy/paste
        # slip; left unchanged pending confirmation.
        cal = r.readRDS("rdata/cell_assign_fit.rdata")
    else:
        CloneAlignInterface = importr("clonealign")
        # This branch referenced the undefined names `cnv_data`,
        # `clone_align_fit` and `filename`. They are mapped to the
        # copy-number argument, the fit just computed, and the same output
        # path the command-line branch checks.
        # NOTE(review): confirm this mapping matches the intended behaviour.
        cal = CloneAlignInterface.clonealign(matrix, copy_number_data)
        robjects.r.assign("clone_align_fit", cal)
        robjects.r("saveRDS(clone_align_fit, file='{}')".format(fit_path))
def plot_by_genes(rdata, tenx_analysis, genes, prefix, rep, pcs):
    """Write one expression-coloured scatter plot per gene.

    Each figure is saved to ``figures/expression/expression_<gene>.png``;
    the directory is created if missing.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx_analysis: Argument for the ``TenxAnalysis`` constructor.
        genes: Iterable of genes used to index the AnnData object.
        prefix: Unused.
        rep: Reduced-dimension name passed to ``getReducedDims``.
        pcs: Unused.
    """
    tenx = TenxAnalysis(tenx_analysis)
    tenx.load()
    sce = SingleCellExperiment.fromRData(rdata)
    dims = sce.getReducedDims(rep)
    barcodes = sce.colData["Barcode"]
    transcripts = sce.rowData["Symbol"]
    adata = tenx.create_scanpy_adata(barcodes=barcodes, transcripts=transcripts)
    x_coded = dict(zip(barcodes, dims[0]))
    y_coded = dict(zip(barcodes, dims[1]))
    if not os.path.exists("figures/expression"):
        os.makedirs("figures/expression")

    # Coordinates are the same for every gene; compute them once.
    x = [x_coded[barcode] for barcode in barcodes]
    y = [y_coded[barcode] for barcode in barcodes]

    for gene in genes:
        expression = [float(adata[barcode, gene].X) for barcode in barcodes]
        figure, ax = plt.subplots(figsize=(10, 8))
        sns.scatterplot(x=x, y=y, hue=expression, alpha=0.85)
        ax.set_title("{} Counts".format(gene))
        ax.legend()
        plt.tight_layout()
        plt.savefig("figures/expression/expression_{}.png".format(gene))
        # pyplot keeps every figure alive until it is closed; without this
        # memory grows with the number of genes.
        plt.close(figure)
def read10xCountsRaw(tenx_analysis, output):
    """Read the raw 10x matrices, save them as an experiment and return it.

    Args:
        tenx_analysis: Object providing ``load()`` and ``raw_matrices()``.
        output: Destination passed to the experiment's ``save``.

    Returns:
        The ``SingleCellExperiment`` built from the raw counts.
    """
    tenx_analysis.load()
    droplet_utils = DropletUtils()
    raw_location = tenx_analysis.raw_matrices()
    rs4_counts = droplet_utils.read10xCounts(raw_location)
    experiment = SingleCellExperiment.fromRS4(rs4_counts)
    experiment.save(output)
    return experiment
def scvis_by_cell_type(rdata, cell_assign_fit, prefix, embedding_file):
    """Scatter-plot an SCVIS embedding coloured by assigned cell type.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        cell_assign_fit: Path to a pickle holding ``"Barcode"`` and
            ``"cell_type"`` sequences.
        prefix: Used in the title and in the output name
            ``figures/scvis_by_cell_type_<prefix>.png``.
        embedding_file: Tab-separated file with one header line; the first
            column of every row is skipped and the rest parsed as floats.

    Barcodes without an assigned cell type are labelled ``"Other"``.
    """
    # NOTE: pickle can execute arbitrary code on load; only use trusted fits.
    with open(cell_assign_fit, "rb") as handle:
        fit = pickle.load(handle)
    sce = SingleCellExperiment.fromRData(rdata)
    barcodes = sce.colData["Barcode"]
    # Only the first len(barcodes) entries of the fit are considered.
    cell_types = dict(
        zip(fit["Barcode"][:len(barcodes)], fit["cell_type"][:len(barcodes)]))

    with open(embedding_file, "r") as handle:
        rows = handle.read().splitlines()
    # rows[0] is the header; column 0 of each data row is not a coordinate.
    dims = [list(map(float, row.split("\t")[1:])) for row in rows[1:]]

    x = []
    y = []
    clusters = []
    # NOTE(review): rows are paired with barcodes purely by position —
    # confirm the embedding file is written in colData order.
    for barcode, dim in zip(barcodes, dims):
        clusters.append(cell_types.get(barcode, "Other"))
        x.append(dim[0])
        y.append(dim[1])

    _, ax = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=x, y=y, hue=clusters, alpha=0.85)
    ax.set_title("SCVIS - Cell Type - {}".format(prefix))
    ax.legend()
    plt.tight_layout()
    plt.savefig("figures/scvis_by_cell_type_{}.png".format(prefix))
def test_assay_names_rdata(self):
    """Every assay in the example RData fixture must be an expected one.

    The check is one-directional: expected assays that are absent from the
    fixture do not fail the test.
    """
    expected_assays = ['BatchCellMeans', 'BaseCellMeans', 'BCV', 'CellMeans', 'TrueCounts', 'counts']
    rdata = os.path.join(base_dir, "tests/example_sce.RData")
    sce_from_rdata = SingleCellExperiment.fromRData(rdata)
    for assay in sce_from_rdata.assays.keys():
        # assertIn reports the offending name on failure; assertTrue only
        # said "False is not true".
        self.assertIn(assay, expected_assays)
def umap_by_cluster(rdata, cluster_labels, sampleid, directory):
    """Plot the UMAP embedding by cluster via ``reduced_dims_by_cluster``.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        cluster_labels: Forwarded unchanged to ``reduced_dims_by_cluster``.
        sampleid: Unused.
        directory: Directory that receives ``umap_by_cluster.png``.
    """
    experiment = SingleCellExperiment.fromRData(rdata)
    raw_umap = experiment.reducedDims["UMAP"]
    cell_barcodes = experiment.colData["Barcode"]
    # Reshape the stored values into two rows, one entry per barcode.
    coordinates = numpy.array(raw_umap).reshape(2, len(cell_barcodes))
    figure_path = os.path.join(directory, "umap_by_cluster.png")
    reduced_dims_by_cluster(cluster_labels, coordinates, cell_barcodes, figure_path, "UMAP")
def cluster_markers(rdata, tenx_analysis, rep, pcs, embedding_file, prefix):
    """Plot the ranked marker genes of every cluster.

    Markers are always computed on the ``"PCA"`` representation; ``rep`` is
    only forwarded to ``plot_by_markers``.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx_analysis: Argument for the ``TenxAnalysis`` constructor.
        rep: Representation name forwarded to ``plot_by_markers``.
        pcs: Forwarded to ``markers_by_clusters`` and ``plot_by_markers``.
        embedding_file: Forwarded to ``plot_by_markers``.
        prefix: Appended to each ``"Cluster <i>"`` label.
    """
    analysis = TenxAnalysis(tenx_analysis)
    analysis.load()
    experiment = SingleCellExperiment.fromRData(rdata)
    ranking = analysis.markers_by_clusters(experiment, rep="PCA", pcs=pcs)
    ranked_names = ranking["rank_genes_groups"]["names"]
    # Transpose the ranked records into one tuple of genes per cluster.
    genes_per_cluster = list(zip(*ranked_names))
    for index, cluster_genes in enumerate(genes_per_cluster):
        label = "Cluster {} {}".format(index, prefix)
        plot_by_markers(rdata, tenx_analysis, cluster_genes, label, rep, pcs, embedding_file)
def test_cell_assign_em(self):
    """Run ``CellAssign.run_em`` on the fixture with a two-group marker matrix.

    The test makes no assertions; it only fails if a step raises.
    """
    fixture_path = os.path.join(base_dir, "tests/cell_assign_test.RData")
    experiment = SingleCellExperiment.fromRData(fixture_path)
    assigner = CellAssign()
    marker_genes = [
        "Gene161", "Gene447", "Gene519", "Gene609", "Gene677",
        "Gene750", "Gene754", "Gene860", "Gene929", "Gene979",
    ]
    groups = ["Groups1", "Groups2"]
    rho = GeneMarkerMatrix(genes=marker_genes, cells=groups)
    res = assigner.run_em(experiment, rho)
def scvis_by_cluster_markers(rdata, tenx_analysis, prefix, pcs, embedding_file):
    """Best-effort computation of cluster markers on the SCVIS embedding.

    Any failure is logged with its traceback and the function returns
    ``None``, as it does on success.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx_analysis: Argument for the ``TenxAnalysis`` constructor.
        prefix: Unused.
        pcs: Forwarded to ``markers_by_clusters``.
        embedding_file: Forwarded to ``markers_by_clusters``.
    """
    import logging

    try:
        tenx = TenxAnalysis(tenx_analysis)
        tenx.load()
        sce = SingleCellExperiment.fromRData(rdata)
        tenx.markers_by_clusters(
            sce, rep="SCVIS", pcs=pcs, embedding_file=embedding_file)
    except Exception:
        # Stay best-effort for callers, but leave a trace so a failure is
        # no longer invisible.
        logging.getLogger(__name__).exception(
            "scvis_by_cluster_markers failed for %s", rdata)
        return
def test_expression_normalization(self):
    """CPM, TPM and FPKM results must keep the shape of the counts assay."""
    fixture_path = os.path.join(base_dir, "tests/example_sce.RData")
    experiment = SingleCellExperiment.fromRData(fixture_path)
    tenx = TenX()
    counts = experiment.assays["counts"]
    # Compute all three normalisations first, then check each shape.
    normalized = [
        tenx.calculateCPM(counts),
        tenx.calculateTPM(counts),
        tenx.calculateFPKM(counts),
    ]
    for matrix in normalized:
        assert matrix.shape == counts.shape
def exportRData(rdata, directory, delim="\t"):
    """Export an RData experiment to plain-text files in ``directory``.

    Writes ``meta.txt`` (string forms of ``sizeFactors`` and
    ``reducedDims``) plus one ``<assay>.csv`` per assay.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        directory: Output directory; created if it does not exist.
        delim: Field separator handed to ``to_csv``.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)
    sce = SingleCellExperiment.fromRData(rdata)

    # The handle used to be left open, so meta.txt was only flushed whenever
    # the interpreter happened to collect it.
    with open(os.path.join(directory, "meta.txt"), "w") as output:
        output.write("sizeFactors: " + str(sce.sizeFactors) + "\n")
        output.write("reducedDims: " + str(sce.reducedDims) + "\n")

    for assay in sce.assayNames:
        filename = os.path.join(directory, "{}.csv".format(assay))
        print(filename)
        dataframe = sce.assay(assay)
        dataframe.to_csv(filename, sep=delim)
def tsne_by_cell_type(rdata, fit, sampleid, directory, known_types=None):
    """Plot the TSNE embedding by cell type via ``reduced_dims_by_cell_type``.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        fit: Mapping with parallel ``"Barcode"`` and ``"cell_type"``
            sequences.
        sampleid: Unused.
        directory: Directory that receives ``tsne_by_cell_type.png``.
        known_types: Forwarded unchanged to ``reduced_dims_by_cell_type``.
    """
    experiment = SingleCellExperiment.fromRData(rdata)
    raw_tsne = experiment.reducedDims["TSNE"]
    cell_barcodes = experiment.colData["Barcode"]
    type_by_barcode = dict(zip(fit["Barcode"], fit["cell_type"]))
    # Reshape the stored values into two rows, one entry per barcode.
    coordinates = numpy.array(raw_tsne).reshape(2, len(cell_barcodes))
    figure_path = os.path.join(directory, "tsne_by_cell_type.png")
    reduced_dims_by_cell_type(
        type_by_barcode,
        coordinates,
        cell_barcodes,
        figure_path,
        "TSNE",
        known_types=known_types,
    )
def test_clone_align(self):
    """Run CloneAlign with copy-number columns A, B and C from rowData.

    Expects 200 entries in the ``"clone"`` field of the result.
    """
    fixture_path = os.path.join(base_dir, "tests/example_sce.rda")
    experiment = SingleCellExperiment.fromRData(fixture_path)
    row_data = experiment.rowData
    # Stack the three columns, then transpose so each column is a clone.
    stacked = numpy.array([row_data[name] for name in ["A", "B", "C"]])
    copy_number = numpy.transpose(stacked)
    print(copy_number.shape)
    aligner = CloneAlign()
    fit = aligner.run(experiment, copy_number)
    assert len(fit["clone"]) == 200
def IntegratedSummary(sce, sampleid, report):
    """Build (or reuse) the per-sample JSON summary and copy it to ``report``.

    The summary is cached at ``viz/<sampleid>.json`` and only rebuilt when
    that file is missing. It holds column data, row data, the non-zero
    log counts per barcode, cell-type assignments, integer UMAP coordinates
    and the marker list read from ``config.rho_matrix``.

    Args:
        sce: Path passed to ``SingleCellExperiment.fromRData``.
        sampleid: Sample identifier; used as JSON key and cache file name.
        report: Destination the cached JSON is copied to.
    """
    if not os.path.exists("viz/"):
        os.makedirs("viz")
    if not os.path.exists("viz/html/"):
        os.makedirs("viz/html/")

    summary_path = "viz/{}.json".format(sampleid)
    if not os.path.exists(summary_path):
        sce = SingleCellExperiment.fromRData(sce)
        column_data = dump_all_coldata(sce)
        patient_data = collections.defaultdict(dict)
        patient_data[sampleid]["celldata"] = column_data
        gene_data = dump_all_rowdata(sce)
        patient_data[sampleid]["genedata"] = gene_data

        # Keep only non-zero entries, keyed as barcode -> symbol -> value.
        logcounts = sce.assays["logcounts"].todense().tolist()
        log_count_matrix = collections.defaultdict(dict)
        for symbol, row in zip(gene_data["Symbol"], logcounts):
            for barcode, cell in zip(column_data["Barcode"], row):
                if float(cell) != 0.0:
                    log_count_matrix[barcode][symbol] = cell
        patient_data[sampleid]["log_count_matrix"] = dict(log_count_matrix)

        rdims = sce.reducedDims["UMAP"]
        barcodes = sce.colData["Barcode"]
        # NOTE(review): assumes the stored values are laid out as all x
        # values followed by all y values — confirm against the writer.
        rdims = numpy.array(rdims).reshape(2, len(barcodes))

        # Turn the dotted cell-type names into display names.
        celltypes = []
        for celltype in sce.colData["cell_type"]:
            if celltype == "Monocyte.Macrophage":
                celltype = "Monocyte/Macrophage"
            else:
                celltype = celltype.replace(".", " ")
            celltypes.append(celltype)
        fit = dict(zip(barcodes, celltypes))

        x_coded = dict(zip(barcodes, rdims[0]))
        y_coded = dict(zip(barcodes, rdims[1]))
        coords = dict()
        for barcode in fit:
            # Coordinates are truncated to int; barcodes whose coordinates
            # are missing or cannot be converted (e.g. NaN) are left out.
            try:
                x_val = int(x_coded[barcode])
                y_val = int(y_coded[barcode])
            except (KeyError, TypeError, ValueError, OverflowError):
                continue
            coords[barcode] = (x_val, y_val)
        patient_data[sampleid]["cellassign"] = fit
        patient_data[sampleid]["umap"] = coords
        patient_data["rho"] = GeneMarkerMatrix.read_yaml(
            config.rho_matrix).marker_list

        # Serialise before opening the file so a failed dump cannot leave an
        # empty cache file that later runs would treat as valid.
        patient_data_str = json.dumps(patient_data)
        with open(summary_path, "w") as output:
            output.write(patient_data_str)

    shutil.copyfile(summary_path, report)
def cell_type_by_cluster(rdata, cell_assign_fit, tenx_analysis, prefix):
    """Bar-plot the fraction of each cell type within every cluster.

    The figure is written to ``figures/cell_type_by_cluster.png``.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        cell_assign_fit: Path to a pickle holding ``"Barcode"`` and
            ``"cell_type"`` sequences.
        tenx_analysis: Argument for the ``TenxAnalysis`` constructor.
        prefix: Used in the plot title.

    Barcodes from the fit that have no cluster are ignored.
    """
    tenx = TenxAnalysis(tenx_analysis)
    tenx.load()
    # NOTE: pickle can execute arbitrary code on load; only use trusted fits.
    with open(cell_assign_fit, "rb") as handle:
        fit = pickle.load(handle)
    cell_types = dict(zip(fit["Barcode"], fit["cell_type"]))
    sce = SingleCellExperiment.fromRData(rdata)
    cluster_labels = tenx.clusters(sce)
    # NOTE(review): other call sites treat tenx.clusters(...) as a
    # barcode -> cluster mapping; if that holds here, zipping it iterates
    # its keys and maps barcodes to barcodes. Confirm the return type.
    clusters = dict(zip(sce.colData["Barcode"], cluster_labels))

    # counts[cluster][cell type] -> number of cells.
    counts = collections.defaultdict(lambda: collections.defaultdict(int))
    # Distinct cell types in order of first appearance (fixes legend order).
    seen_types = set()
    ordered_types = []
    for barcode, cell in cell_types.items():
        try:
            cluster = str(clusters[barcode])
        except KeyError:
            continue
        counts[cluster][cell] += 1
        if cell not in seen_types:
            seen_types.add(cell)
            ordered_types.append(cell)

    plt.subplots(figsize=(16, 8))

    fclusters = []
    fcelltypes = []
    fpercentages = []
    for cluster, type_counts in counts.items():
        total = float(sum(type_counts.values()))
        # One row per (cluster, cell type). The old loop emitted one row per
        # *cell*, producing many identical rows for the bar plot to average.
        for cell in ordered_types:
            fclusters.append(cluster)
            fcelltypes.append(cell)
            fpercentages.append(type_counts.get(cell, 0) / total)

    df = pandas.DataFrame({
        "Cluster": fclusters,
        "Cell Type": fcelltypes,
        "Percentage": fpercentages
    })
    ax = sns.barplot(x="Cluster",
                     y="Percentage",
                     hue="Cell Type",
                     data=df,
                     palette="tab10")
    ax.set_title("Cell Type by Cluster - {}".format(prefix))
    plt.tight_layout()
    plt.savefig("figures/cell_type_by_cluster.png")
def create_input_files(rdata, components, output):
    """Write the leading block of the PCA embedding to ``matrix.tsv``.

    The embedding is cut into consecutive slices of ``components`` entries
    and only the first slice is written, with a ``PC_<i>`` header.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        components: Slice length and number of header columns; also passed
            to ``getReducedDims`` as ``n``.
        output: Directory that receives ``matrix.tsv``.

    Returns:
        Path of the written file.
    """
    experiment = SingleCellExperiment.fromRData(rdata)
    pca = experiment.getReducedDims("PCA", n=components)
    slices = [
        pca[start:start + components]
        for start in range(0, len(pca), components)
    ]
    # NOTE(review): only the first slice is kept — confirm that dropping the
    # remaining slices is intended.
    matrix = numpy.array(slices[0])
    print(matrix.shape)
    column_names = "\t".join("PC_{}".format(index) for index in range(components))
    target = os.path.join(output, "matrix.tsv")
    numpy.savetxt(target, matrix, delimiter="\t", header=column_names)
    return target
def qcd_sce(self):
    """Load the experiment stored at ``self.qcdrdata``.

    The step that used to generate that file (QualityControl build/filter
    and an Rscript call) is disabled, so the file must already exist.
    """
    qc_path = self.qcdrdata
    return SingleCellExperiment.fromRData(qc_path)
def umap_by_cluster(rdata, tenx, prefix, pcs):
    """Scatter-plot the UMAP embedding coloured by cluster.

    Args:
        rdata: Path passed to ``SingleCellExperiment.fromRData``.
        tenx: Object whose ``clusters(sce, pcs=...)`` returns a mapping of
            barcode to cluster label.
        prefix: Output directory for ``umap_by_cluster.png``; also used in
            the title.
        pcs: Forwarded to ``tenx.clusters``.

    Raises:
        KeyError: If a clustered barcode has no UMAP coordinate.
    """
    experiment = SingleCellExperiment.fromRData(rdata)
    labels_by_barcode = tenx.clusters(experiment, pcs=pcs)
    raw_umap = experiment.reducedDims["UMAP"]
    cell_barcodes = experiment.colData["Barcode"]
    # Reshape the stored values into two rows, one entry per barcode.
    coordinates = numpy.array(raw_umap).reshape(2, len(cell_barcodes))
    x_lookup = dict(zip(cell_barcodes, coordinates[0]))
    y_lookup = dict(zip(cell_barcodes, coordinates[1]))

    xs, ys, hue = [], [], []
    for cell_barcode, label in labels_by_barcode.items():
        hue.append("Cluster {}".format(label))
        xs.append(x_lookup[cell_barcode])
        ys.append(y_lookup[cell_barcode])

    figure, axes = plt.subplots(figsize=(10, 8))
    sns.scatterplot(x=xs, y=ys, hue=hue, alpha=0.85)
    axes.set_title("UMAP - Clusters - {}".format(prefix))
    axes.legend()
    plt.tight_layout()
    plt.savefig("{}/umap_by_cluster.png".format(prefix))
def marker_analysis(sce, tenx, rho, cell_assign_fit, figure):
    """Draw dot and stacked-violin plots of marker genes by cell type.

    Args:
        sce: Path passed to ``SingleCellExperiment.fromRData``.
        tenx: Object providing ``gene_map`` and ``create_scanpy_adata``.
        rho: Mapping whose values are lists of marker genes.
        cell_assign_fit: Path to a pickle holding ``"Barcode"`` and
            ``"cell_type"`` sequences.
        figure: Unused.

    Returns:
        ``["dot_plot.png", "stacked_violinvin_stacked.png"]``.
    """
    sce = SingleCellExperiment.fromRData(sce)
    # NOTE: pickle can execute arbitrary code on load; only use trusted fits.
    with open(cell_assign_fit, "rb") as handle:
        fit = pickle.load(handle)

    gene_markers = []
    for markers in rho.values():
        gene_markers += markers
    _marker_genes = list(set(gene_markers))

    # Translate marker names where a mapping exists; keep the name otherwise.
    convert = tenx.gene_map(sce)
    marker_genes = []
    for gene in _marker_genes:
        try:
            marker_genes.append(convert[gene])
        except KeyError:
            marker_genes.append(gene)
            print('No conversion for ', gene)
    print(marker_genes)

    adata = tenx.create_scanpy_adata(sce, fast_load=False)
    print(len(adata.obs.index))
    print(len(fit["cell_type"]))

    # Cells without an assignment are labelled "Other".
    _cell_types = dict(zip(fit["Barcode"], fit["cell_type"]))
    cell_types = [_cell_types.get(barcode, "Other") for barcode in adata.obs.index]
    adata.obs["Cell Type"] = cell_types
    print(len(cell_types))

    # Only genes present in the AnnData variables can be plotted.
    marker_genes = list(set(marker_genes).intersection(set(adata.var.index)))
    print(len(marker_genes))
    print(marker_genes)

    sc.pl.dotplot(adata, marker_genes, groupby='Cell Type', save="matrix.png")
    sc.pl.stacked_violin(adata,
                         marker_genes,
                         groupby='Cell Type',
                         rotation=90,
                         save="vin_stacked.png")
    # NOTE(review): the dot plot is saved with the suffix "matrix.png", yet
    # the first name returned is "dot_plot.png" — confirm which file name
    # callers expect.
    return ["dot_plot.png", "stacked_violinvin_stacked.png"]