Ejemplo n.º 1
0
def Search(sampleid):
    """Gather ``adata`` results for ``sampleid`` and for cached project samples.

    The main sample is downloaded, loaded, extracted and passed through
    ``QualityControl``; its ``adata`` (built from ``qc.sce``) is the first
    element of the result.  Afterwards every directory matching
    ``../../*/runs/.cache/*/metrics_summary.csv`` is visited and, when it
    contains ``<dirname>.rdata``, its ``adata`` is appended as well.

    Args:
        sampleid: Identifier handed to ``TenxDataStorage`` and
            ``QualityControl``.

    Returns:
        list: One ``adata`` result per sample that could be loaded.
    """
    storage = TenxDataStorage(sampleid, version="v3")
    storage.download()
    analysis_path = storage.tenx_path
    print(analysis_path)

    main_analysis = TenxAnalysis(analysis_path)
    main_analysis.load()
    main_analysis.extract()
    qc = QualityControl(main_analysis, sampleid)
    collected = [main_analysis.adata(qc.sce)]
    print ("Loading main sce {}".format(sampleid))
    sys.stdout.flush()

    for summary_csv in glob.glob("../../*/runs/.cache/*/metrics_summary.csv"):
        print ("Loading project sample {}".format(summary_csv))
        sys.stdout.flush()
        sample_dir = os.path.dirname(summary_csv)
        # The cache directory name doubles as the sample id.
        sid = sample_dir.split("/")[-1]
        sce_path = os.path.join(sample_dir, "{0}.rdata".format(sid))
        if os.path.exists(sce_path):
            project_analysis = TenxAnalysis(sample_dir)
            project_analysis.load()
            project_analysis.extract()
            collected.append(project_analysis.adata(sce_path))
        else:
            print("Not found", sce_path)

    print ("Finished project tree search.")
    sys.stdout.flush()
    return collected
Ejemplo n.º 2
0
 def get_tenx(samples):
     """Download each sample and wrap its local path in a ``TenxAnalysis``.

     Args:
         samples: Iterable of sample identifiers accepted by
             ``TenxDataStorage``.

     Returns:
         list: One ``TenxAnalysis`` per input sample, in input order.
     """
     def _fetch(sample):
         # Download first so that ``tenx_path`` is read after the download.
         storage = TenxDataStorage(sample)
         storage.download()
         return TenxAnalysis(storage.tenx_path)

     return [_fetch(sample) for sample in samples]
Ejemplo n.º 3
0
def upload_tenx(sampleid, before, finished):
    """Finalize the local Cell Ranger output of ``sampleid`` and upload it.

    Args:
        sampleid: Sample identifier; the output is read from
            ``./<sampleid>/outs/``.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"`` once
            the upload call has returned.
    """
    print("Calling upload.")
    tenx = TenxAnalysis("./{}/outs/".format(sampleid))
    tenx.finalize()
    tenxds = TenxDataStorage(sampleid)
    tenxds.upload_cellranger(tenx)
    # Context manager guarantees the marker is flushed and closed.
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 4
0
def Run(sampleid, before, finished):
    """Assemble the QC / CellAssign plots of ``sampleid`` into a ``Results``.

    The sample is downloaded, loaded and extracted, a ``QualityControl`` object
    supplies the plot directory, and each expected plot file is registered on
    a ``Results`` instance that is then passed to ``exportMD`` and
    ``exportUpload``.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.
    """
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    plots = qc.plots
    # CellAssign output lives next to (not inside) the QC plot directory.
    cellassign = os.path.join(os.path.split(plots)[0], "cellassignanalysis")
    results = Results(config.jobpath)

    results.add_analysis(tenx.tenx_path)
    results.add_sce(qc.qcdsce)

    umi = os.path.join(plots, "umi.png")
    mito = os.path.join(plots, "mito.png")
    ribo = os.path.join(plots, "ribo.png")
    total_counts = os.path.join(plots, "total_counts.png")
    tfbc = os.path.join(plots, "total_features_by_counts.png")
    tcvfc = os.path.join(plots, "total_counts_v_features_by_counts.png")
    celltypes = os.path.join(cellassign, "cell_types.png")

    results.add_plot(umi, "UMI Distribution")
    results.add_plot(mito, "Mito Distribution")
    results.add_plot(ribo, "Ribo Distribution")
    results.add_plot(total_counts, "Total Counts Distribution")
    results.add_plot(tcvfc, "Total Counts")
    # Previously this entry reused ``tcvfc``, so the same image was listed
    # twice and total_features_by_counts.png was never reported.
    results.add_plot(tfbc, "Total Features by Counts")
    results.add_plot(celltypes, "Cell Types")

    exportMD(results)
    exportUpload(results)
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 5
0
def Analysis(sampleid, before, finished):
    """Run the cell-type plotting helpers for ``sampleid``.

    Loads the pickled fit from ``.cache/<sampleid>/cell_types.pkl``, reads the
    marker matrix referenced by ``config.rho_matrix`` and calls ``celltypes``,
    ``tsne_by_cell_type`` and ``umap_by_cell_type`` with
    ``.cache/<sampleid>/cellassignanalysis/`` as the output location.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.

    Raises:
        AssertionError: If the pickled fit does not exist.
    """
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    analysis_path = tenx.tenx_path
    tenx_analysis = TenxAnalysis(analysis_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)

    cellassign_analysis = ".cache/{}/cellassignanalysis/".format(sampleid)
    if not os.path.exists(cellassign_analysis):
        os.makedirs(cellassign_analysis)

    pyfit_path = ".cache/{}/cell_types.pkl".format(sampleid)
    # Kept as an assert so callers still see AssertionError for a missing fit.
    assert os.path.exists(pyfit_path), "No Pyfit Found."
    with open(pyfit_path, "rb") as handle:
        pyfit = pickle.load(handle)

    marker_list = GeneMarkerMatrix.read_yaml(config.rho_matrix)
    cell_types = marker_list.celltypes()
    if "B cell" not in cell_types:
        cell_types.append("B cell")

    celltypes(pyfit, sampleid, cellassign_analysis, known_types=cell_types)
    tsne_by_cell_type(qc.sce,
                      pyfit,
                      sampleid,
                      cellassign_analysis,
                      known_types=cell_types)
    umap_by_cell_type(qc.sce,
                      pyfit,
                      sampleid,
                      cellassign_analysis,
                      known_types=cell_types)
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 6
0
def Run(sampleid, finished):
    """Run Cell Ranger ``count`` for ``sampleid`` and upload the result.

    The count/finalize/upload sequence is skipped when a file named
    ``cellranger.complete`` exists in the current working directory; the
    marker file is written in either case.

    Args:
        sampleid: Sample identifier; output is read from
            ``./<sampleid>/outs/``.
        finished: Path of the marker file that receives ``"Completed"``.
    """
    if not os.path.exists("cellranger.complete"):
        CellRanger.count([sampleid])
        tenx = TenxAnalysis("./{}/outs/".format(sampleid))
        tenx.finalize()
        tenxds = TenxDataStorage(sampleid)
        tenxds.upload_cellranger(tenx)
    # Context manager guarantees the marker is flushed and closed.
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 7
0
def RunUpload(sampleid, finished, species):
    """Finalize and upload the Cell Ranger output stored under the job path.

    Args:
        sampleid: Sample identifier; output is read from
            ``<config.jobpath>/<sampleid>/outs/``.
        finished: Path of the marker file that receives ``"Completed"``.
        species: Forwarded to ``TenxDataStorage`` as ``species``.
    """
    print("Uploading ", species, sampleid)
    tenx_output = os.path.join(config.jobpath, "{}/outs/".format(sampleid))
    tenx = TenxAnalysis(tenx_output)
    tenx.finalize()
    tenxds = TenxDataStorage(sampleid, species=species)
    print("Running upload")
    tenxds.upload_cellranger(tenx)
    # Context manager guarantees the marker is flushed and closed.
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 8
0
def Run(sampleid, before, finished):
    """Run ``CellAssign`` on the QC object of ``sampleid``.

    The result path handed to ``CellAssign.run`` is
    ``.cache/<sampleid>/celltypes.rdata``.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.
    """
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    analysis_path = tenx.tenx_path
    tenx_analysis = TenxAnalysis(analysis_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    CellAssign.run(qc.sce, config.rho_matrix,
                   ".cache/{}/celltypes.rdata".format(sampleid))
    # Context manager guarantees the marker is flushed and closed.
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 9
0
 def __init__(self, sampleids, chem="v2", output="./"):
     """Download, load and extract every sample in ``sampleids``.

     Args:
         sampleids: Iterable of sample identifiers; stored as
             ``self.samples``.
         chem: Passed to ``TenxDataStorage`` as ``version``.
         output: Stored unchanged as ``self.output``.
     """
     self.output = output
     self.samples = sampleids
     # One prepared TenxAnalysis per sample, in the order of ``sampleids``.
     self.tenxs = []
     for sid in self.samples:
         storage = TenxDataStorage(sid, version=chem)
         storage.download()
         analysis = TenxAnalysis(storage.tenx_path)
         analysis.load()
         analysis.extract()
         self.tenxs.append(analysis)
Ejemplo n.º 10
0
def main():
    """Run the Kallisto ``de`` step for the hard-coded ``patient2`` sample.

    The sample is downloaded and loaded, a ``FastQDirectory`` is built from
    fixed scratch paths, and both are handed to ``Kallisto`` before calling
    its ``de`` method.
    """
    sample = "patient2"
    output = "/igo_large/scratch/test_kallisto"
    fastq_source = "/igo_large/scratch/allen/bams/xfastqs2/McGilvery_Sonya__TLH_MissingLibrary_1_CB8R9ANXX/"

    storage = TenxDataStorage(sample, version="v2")
    storage.download()
    analysis = TenxAnalysis(storage.tenx_path)
    analysis.load()

    fastq_directory = FastQDirectory(fastq_source, sample, output)

    runner = Kallisto(fastq_directory, analysis)
    runner.de()
Ejemplo n.º 11
0
def Run(sampleid, before, finished):
    """Run quality control for ``sampleid`` and upload the results.

    Calls ``qc.run`` with ``config.mito`` as the ``mito`` argument, followed
    by ``qc.upload_raw`` and ``qc.upload``.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.
    """
    print("Running QC.")
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    tenx_analysis = TenxAnalysis(tenx.tenx_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    print("Extracted.")
    qc = QualityControl(tenx_analysis, sampleid)
    qc.run(mito=config.mito)
    print("Uploading")
    qc.upload_raw()
    qc.upload()
    # Context manager guarantees the marker is flushed and closed.
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 12
0
def Run(sampleid, before, finished, use_corrected=False):
    """Run ``CellAssign`` for ``sampleid`` unless its result already exists.

    When ``use_corrected`` is true and ``.cache/corrected/`` exists, the
    input is ``.cache/corrected/corrected_sce.rdata`` (created through
    ``DropletUtils.read10xCounts`` if missing).  Otherwise the sample is
    downloaded and the input is ``qc.sce``.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.
        use_corrected: Prefer the corrected matrices under
            ``.cache/corrected/`` when that directory exists.
    """
    corrected_dir = ".cache/corrected/"
    if use_corrected and os.path.exists(corrected_dir):
        sce = ".cache/corrected/corrected_sce.rdata"
        if not os.path.exists(sce):
            DropletUtils().read10xCounts(corrected_dir, sce)
    else:
        tenx = TenxDataStorage(sampleid, version="v3")
        tenx.download()
        analysis_path = tenx.tenx_path
        tenx_analysis = TenxAnalysis(analysis_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        qc = QualityControl(tenx_analysis, sampleid)
        sce = qc.sce

    celltypes_rdata = ".cache/{}/celltypes.rdata".format(sampleid)
    # An existing result is treated as a cache hit and not recomputed.
    if not os.path.exists(celltypes_rdata):
        CellAssign.run(sce, config.rho_matrix, celltypes_rdata)
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 13
0
def Run(sampleid, before, finished):
    """Compute (or reload) clusters for ``sampleid`` and call the plot helpers.

    Clusters are cached as a pickle at
    ``.cache/<sampleid>/clustering/<sampleid>_clusters.pkl``; when that file
    exists it is loaded instead of calling ``tenx_analysis.clusters``.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.
    """
    clustering = ".cache/{}/clustering/".format(sampleid)
    if not os.path.exists(clustering):
        os.makedirs(clustering)
    cluster_results = os.path.join(clustering,
                                   "{}_clusters.pkl".format(sampleid))
    tenx = TenxDataStorage(sampleid, version="v3")
    tenx.download()
    analysis_path = tenx.tenx_path
    tenx_analysis = TenxAnalysis(analysis_path)
    tenx_analysis.load()
    tenx_analysis.extract()
    qc = QualityControl(tenx_analysis, sampleid)
    if not os.path.exists(cluster_results):
        clusters = tenx_analysis.clusters(qc.sce)
        # Close explicitly so the pickle is fully flushed before it is reused.
        with open(cluster_results, "wb") as handle:
            pickle.dump(clusters, handle)
    else:
        with open(cluster_results, "rb") as handle:
            clusters = pickle.load(handle)
    tsne_by_cluster(qc.sce, clusters, sampleid, clustering)
    umap_by_cluster(qc.sce, clusters, sampleid, clustering)
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 14
0
def Analysis(sampleid, before, finished, use_corrected=False):
    """Run the cell-type plotting helpers, optionally on corrected data.

    When ``use_corrected`` is true and ``.cache/corrected`` exists, the input
    is ``.cache/corrected/corrected_sce.rdata`` (created through
    ``DropletUtils.read10xCounts`` if missing).  Otherwise the sample is
    downloaded and the input is ``sce_cas.rdata`` in the directory that
    holds ``qc.sce``.

    Args:
        sampleid: Sample identifier.
        before: Not used inside this function.
        finished: Path of the marker file that receives ``"Completed"``.
        use_corrected: Prefer the corrected matrices under
            ``.cache/corrected`` when that directory exists.

    Raises:
        AssertionError: If ``.cache/<sampleid>/cell_types.pkl`` is missing.
    """
    if use_corrected and os.path.exists(".cache/corrected"):
        filtered_sce = ".cache/corrected/corrected_sce.rdata"
        if not os.path.exists(filtered_sce):
            DropletUtils().read10xCounts(".cache/corrected/", filtered_sce)
    else:
        tenx = TenxDataStorage(sampleid, version="v3")
        tenx.download()
        analysis_path = tenx.tenx_path
        tenx_analysis = TenxAnalysis(analysis_path)
        tenx_analysis.load()
        tenx_analysis.extract()
        qc = QualityControl(tenx_analysis, sampleid)
        filtered_sce = os.path.join(os.path.split(qc.sce)[0], "sce_cas.rdata")

    cellassign_analysis = ".cache/{}/cellassignanalysis/".format(sampleid)
    if not os.path.exists(cellassign_analysis):
        os.makedirs(cellassign_analysis)

    pyfit_path = ".cache/{}/cell_types.pkl".format(sampleid)
    # Kept as an assert so callers still see AssertionError for a missing fit.
    assert os.path.exists(pyfit_path), "No Pyfit Found."
    with open(pyfit_path, "rb") as handle:
        pyfit = pickle.load(handle)

    marker_list = GeneMarkerMatrix.read_yaml(config.rho_matrix)
    cell_types = marker_list.celltypes()
    if "B cell" not in cell_types:
        cell_types.append("B cell")
    celltypes(pyfit, sampleid, cellassign_analysis, known_types=cell_types)

    tsne_by_cell_type(filtered_sce,
                      pyfit,
                      sampleid,
                      cellassign_analysis,
                      known_types=cell_types)
    umap_by_cell_type(filtered_sce,
                      pyfit,
                      sampleid,
                      cellassign_analysis,
                      known_types=cell_types)
    with open(finished, "w") as marker:
        marker.write("Completed")
Ejemplo n.º 15
0
def RunDownload(sampleids, finished):
    """Download every sample and record its local path in a JSON marker file.

    Args:
        sampleids: Iterable of sample identifiers.
        finished: Callable mapping the sample's position in ``sampleids`` to
            the path of its marker file; each marker receives the JSON
            object ``{sample: <value returned by download()>}``.
    """
    for index, sample in enumerate(sampleids):
        tenx = TenxDataStorage(sample)
        path = tenx.download()
        # Context manager guarantees each marker is flushed and closed.
        with open(finished(index), "w") as marker:
            marker.write(json.dumps({sample: path}))
Ejemplo n.º 16
0
 def run_transcript(self, fastqs=None, chem="v2"):
     """Build per-sample Kallisto matrices and test genes for differences.

     For each sample a ``Kallisto`` runner produces a design matrix.  The
     first two samples are then compared gene by gene.  If the results file
     ``<sample0>_<sample1>_de.tsv`` does not exist yet it is computed and
     written; otherwise it is read back and the p-values are adjusted with
     ``multitest.multipletests``.

     Args:
         fastqs: One fastq object per entry of ``self.samples``, in the same
             order.
         chem: Chemistry/version string given to ``TenxDataStorage`` and
             ``Kallisto`` (defaults to the previously hard-coded ``"v2"``).

     Returns:
         list: ``(gene, p_value)`` tuples sorted by ascending p-value (raw
         p-values on the first run, adjusted p-values when the results file
         already existed).

     Raises:
         AssertionError: If ``fastqs`` and ``self.samples`` differ in length.
     """
     fastqs = [] if fastqs is None else fastqs
     assert len(fastqs) == len(
         self.samples), "Provide fastq object for each sample."
     matrices = dict()
     # Pair each sample with its own fastq object (the validated argument).
     for sampleid, fastq in zip(self.samples, fastqs):
         tenx = TenxDataStorage(sampleid, version=chem)
         tenx.download()
         tenx_analysis = TenxAnalysis(tenx.tenx_path)
         tenx_analysis.load()
         tenx_analysis.extract()
         self.krunner = Kallisto(fastq, tenx_analysis, chem=chem)
         self.krunner.run_pseudo()
         self.krunner.run_bus()
         matrices[sampleid] = self.krunner.design_matrix()
     self.matrices = matrices
     self.matrix1 = self.matrices[self.samples[0]]
     self.matrix2 = self.matrices[self.samples[1]]
     self.common_genes = set(self.matrix1.keys()).intersection(
         set(self.matrix2.keys()))
     self.model = LogisticRegression(random_state=0,
                                     solver='lbfgs',
                                     multi_class='multinomial')
     de_file = "{}_{}_de.tsv".format(self.samples[0], self.samples[1])
     if not os.path.exists(de_file):
         return self._fit_differential_genes(de_file)
     return self._summarize_differential_genes(de_file)

 def _fit_differential_genes(self, de_file):
     """Fit one logistic regression per common gene; write raw p-values."""
     differential_genes = dict()
     with open(de_file, "w") as output:
         output.write("Gene\tPValue\n")
         for gene in tqdm.tqdm(self.common_genes):
             tcc_common = set(self.matrix1[gene].keys()).intersection(
                 set(self.matrix2[gene].keys()))
             if len(tcc_common) == 0:
                 continue
             # NOTE(review): a cell listed under several TCCs appears once
             # per TCC here and so yields duplicate rows -- confirm intended.
             cells1 = list(
                 itertools.chain.from_iterable(
                     list(self.matrix1[gene][tcc].keys())
                     for tcc in tcc_common))
             cells2 = list(
                 itertools.chain.from_iterable(
                     list(self.matrix2[gene][tcc].keys())
                     for tcc in tcc_common))
             if len(cells1) == 0 or len(cells2) == 0:
                 continue
             Y = []
             X = []
             for label, matrix, cells in ((self.samples[0], self.matrix1,
                                           cells1),
                                          (self.samples[1], self.matrix2,
                                           cells2)):
                 for cell in cells:
                     Y.append(label)
                     predictors = []
                     for tcc in tcc_common:
                         # A cell missing from a TCC contributes a zero.
                         try:
                             predictors.append(matrix[gene][tcc][cell])
                         except KeyError:
                             predictors.append(0)
                     X.append(predictors)
             classes = set(Y)
             Y = numpy.array(Y)
             X = numpy.array(X)
             if Y.shape[0] < 2 or len(classes) == 1:
                 continue
             self.model.fit(X, Y)
             null_prob = 2.0 / float(Y.shape[0]) * numpy.ones(Y.shape)
             df = X.shape[1]
             alt_prob = self.model.predict_proba(X)
             alt_log_likelihood = -log_loss(Y, alt_prob, normalize=False)
             null_log_likelihood = -log_loss(Y, null_prob, normalize=False)
             # Likelihood-ratio statistic evaluated against chi-square(df).
             G = 2 * (alt_log_likelihood - null_log_likelihood)
             p_value = chi2.sf(G, df)
             differential_genes[gene] = p_value
             output.write("{}\t{}\n".format(gene, p_value))
     sorted_genes = sorted(differential_genes.items(),
                           key=operator.itemgetter(1))
     print("**************** Differential Genes ********************")
     for gene, pvalue in sorted_genes[:100]:
         print(gene, pvalue)
     return sorted_genes

 def _summarize_differential_genes(self, de_file):
     """Read ``de_file``, adjust its p-values and print a summary."""
     import collections

     with open(de_file, "r") as handle:
         rows = handle.read().splitlines()
     rows.pop(0)  # drop the "Gene\tPValue" header
     genes = []
     pvalues = []
     for row in rows:
         gene, pvalue = row.split()
         genes.append(gene)
         pvalues.append(float(pvalue))
     adj_pvalues = list(multitest.multipletests(pvalues)[1])
     print(adj_pvalues)
     differential_genes_adj = dict(zip(genes, adj_pvalues))
     sorted_genes = sorted(differential_genes_adj.items(),
                           key=operator.itemgetter(1))
     thresholds = (0.05, 0.01, 0.001)
     sig_genes = collections.defaultdict(list)
     for gene, pvalue in sorted_genes:
         for threshold in thresholds:
             if pvalue < threshold:
                 sig_genes[str(threshold)].append(gene)
     print("**************** Differential Genes ********************")
     for thresh, hits in sig_genes.items():
         print(thresh, len(hits))
     for gene, pvalue in sorted_genes[:100]:
         print(gene, pvalue)
     return sorted_genes
Ejemplo n.º 17
0
    def load(self):
        """Resolve the data directory and derive the standard output paths.

        If ``self.directory`` does not exist on disk it is treated as a sample
        name: ``.cache/<sample>`` is used when present, otherwise the sample
        is downloaded through ``TenxDataStorage``.  The raw and filtered
        matrix folders are then looked up under their v3 and v2 names (plain
        first, then with a ``_mex`` suffix) and ``self.detected_version`` is
        set from whichever folder is found; a match for the filtered folder
        overrides the one from the raw folder.
        """
        if not os.path.exists(self.directory):
            self.sample = self.directory
            cached = ".cache/{}".format(self.sample)
            if os.path.exists(cached):
                self.directory = cached
            else:
                self.directory = TenxDataStorage(self.sample).download()
        self.path = self.directory

        raw_v3 = os.path.join(self.path, 'raw_feature_bc_matrix')
        raw_v2 = os.path.join(self.path, 'raw_gene_bc_matrices')
        # When no raw folder is found the attribute keeps pointing at the
        # v2 location.
        self.raw_gene_bc_matrices = raw_v2
        raw_candidates = (
            (raw_v3, "v3"),
            (raw_v2, "v2"),
            (raw_v3 + "_mex", "v3"),
            (raw_v2 + "_mex", "v2"),
        )
        for folder, version in raw_candidates:
            if os.path.exists(folder):
                self.raw_gene_bc_matrices = folder
                self.detected_version = version
                break
        else:
            print(
                "No Raw Matrices folder found -- Check dir name (raw_feature_bc_matrix or raw_gene_bc_matrices)"
            )

        filtered_v3 = os.path.join(self.path, 'filtered_feature_bc_matrix')
        filtered_v2 = os.path.join(self.path, 'filtered_gene_bc_matrices')
        filtered_candidates = (
            (filtered_v3, "v3"),
            (filtered_v2, "v2"),
            (filtered_v3 + "_mex", "v3"),
            (filtered_v2 + "_mex", "v2"),
        )
        # Unlike the raw case, nothing is assigned when no folder matches.
        for folder, version in filtered_candidates:
            if os.path.exists(folder):
                self.filtered_gene_bc_matrices = folder
                self.detected_version = version
                break
        else:
            print(
                "No Filtered Matrices folder found -- Check dir name (filtered_feature_bc_matrix or filtered_gene_bc_matrices)"
            )

        self.clustering = os.path.join(self.path, 'analysis/clustering')
        self.matrix = os.path.join(self.path, "")
        self.projection = os.path.join(
            self.path, 'analysis/pca/10_components/projection.csv')
        self.cellranger_tsne = os.path.join(
            self.path, 'analysis/tsne/2_components/projection.csv')
        self.summary = os.path.join(self.path, "web_summary.html")
        self.metrics_summary = os.path.join(self.path, "metrics_summary.csv")
        # Three path components above the data directory.
        self.top_level = "/".join(self.path.split("/")[:-3])

        self.baseobj = "sce.rdata"
        self.qcdobj = "qcdsce.rdata"
        self.rdata = os.path.join(self.directory, self.baseobj)
        self.qcdrdata = os.path.join(self.directory, self.qcdobj)