def identifyLowConfidenceTranscript(infile, outfile):
    '''Identify the transcripts which cannot be confidently quantified.

    These fall into two categories:

    1. Transcripts with poor accuracy of estimated counts
       - transcripts with >2 absolute fold difference between the
         sum of ground truths and the sum of estimated counts are
         flagged

    2. Transcripts with poor correlation between estimated counts
       - spline fitted to relationship between correlation and kmer
         fraction. Cut-off of 0.9 used to define minimum kmer
         fraction threshold. Transcripts below threshold are flagged

    Category 2 is not yet implemented. Currently the minimum kmer
    fraction is hardcoded as 0.03. Need to implement automatic
    threshold generation from data.

    Arguments
    ---------
    infile
        Input passed through unchanged to
        TranscriptDiffExpression.identifyLowConfidenceTranscripts.
    outfile
        Output passed through unchanged to the same function.
    '''

    job_memory = "2G"

    # Forward the memory request explicitly so it is part of the
    # submitted call, matching the other submit=True calls in this file.
    TranscriptDiffExpression.identifyLowConfidenceTranscripts(
        infile, outfile, submit=True, job_memory=job_memory)
def runSleuth(infiles, outfiles):
    '''Run Sleuth to perform differential testing.

    The memory requested for each submitted job is estimated from the
    number of bootstraps, included samples and transcripts.  One job is
    submitted per contrast configured for the design.

    Arguments
    ---------
    infiles
        Pair ``(design, transcripts)``.  ``design`` is loaded with
        Expression.ExperimentalDesign and its name (minus the
        ``.design.tsv`` suffix) selects the ``sleuth_model_*`` and
        ``sleuth_contrasts_*`` entries in PARAMS.  ``transcripts`` is
        read line by line and every line starting with ``>`` is counted
        as one transcript (presumably a FASTA file).
    outfiles
        Triple ``(outfile, counts, tpm)`` passed through to
        TranscriptDiffExpression.runSleuth.
    '''

    design, transcripts = infiles
    outfile, counts, tpm = outfiles

    Design = Expression.ExperimentalDesign(design)
    # presumably 'include' is a 0/1 flag, so the sum is the sample count
    number_samples = sum(Design.table['include'])

    with IOTools.openFile(transcripts, "r") as inf:
        number_transcripts = sum(
            1 for line in inf if line.startswith(">"))

    # TS: rough estimate is 24 bytes * bootstraps * samples * transcripts
    # (https://groups.google.com/forum/#!topic/kallisto-sleuth-users/mp064J-DRfI)
    # I've found this to be a serious underestimate so this is a more
    # conservative estimate
    memory_estimate = (48 * PARAMS["kallisto_bootstrap"] * number_samples *
                       number_transcripts)

    # Divide by a float so the conversion from bytes to G is never
    # truncated by integer division (which could yield "0.000000G").
    bytes_per_gb = 1024.0 ** 3
    job_memory = "%fG" % (memory_estimate / bytes_per_gb)

    design_id = P.snip(design, ".design.tsv")
    model = PARAMS["sleuth_model_%s" % design_id]

    contrasts = PARAMS["sleuth_contrasts_%s" % design_id].split(",")

    # NOTE(review): every contrast is passed the same outfile, counts
    # and tpm paths - confirm the callee copes with repeated writes.
    for contrast in contrasts:
        TranscriptDiffExpression.runSleuth(
            design, "quant.dir/kallisto", model, contrast,
            outfile, counts, tpm, PARAMS["sleuth_fdr"],
            submit=True, job_memory=job_memory)
def expressionSummaryPlots(infiles, logfile):
    '''Make summary plots of expression values for a design file.

    Arguments
    ---------
    infiles
        Pair ``(counts, design)`` handed on, in that order, to
        TranscriptDiffExpression.makeExpressionSummaryPlots.
    logfile
        Third positional argument forwarded to the same function.
    '''

    counts_table, design_file = infiles

    job_memory = "4G"
    # keyword options controlling how the plotting call is submitted
    submit_options = {"submit": True, "job_memory": job_memory}

    TranscriptDiffExpression.makeExpressionSummaryPlots(
        counts_table, design_file, logfile, **submit_options)