def identifyLowConfidenceTranscript(infile, outfile):
    '''
    identify the transcripts which cannot be confidently quantified
    these fall into two categories:

    1. Transcripts with poor accuracy of estimated counts

       - transcripts with >2 absolute fold difference between the
         sum of ground truths and the sum of estimated counts are
         flagged

    2. Transcripts with poor correlation between estimated counts

       - spline fitted to relationship between correlation and kmer fraction.
         cut-off of 0.9 used to define minimum kmer fraction threshold.
         transcripts below threshold are flagged

    Category 2 is not yet implemented. Currently the minimum kmer
    fraction is hardcoded as 0.03. Need to implement automatic
    threshold generation from data

    The work itself is delegated to
    TranscriptDiffExpression.identifyLowConfidenceTranscripts, which
    is called with submit=True.
    '''

    job_memory = "2G"

    # hand the memory request to the submitted call explicitly, in the
    # same way as the other submit=True wrappers in this file; a bare
    # local variable is not an argument of the call
    TranscriptDiffExpression.identifyLowConfidenceTranscripts(
        infile, outfile, submit=True, job_memory=job_memory)
def runSleuth(infiles, outfiles):
    ''' run Sleuth to perform differential testing

    infiles is a (design, transcripts) pair. The design file is read
    with Expression.ExperimentalDesign; the transcripts file is only
    scanned to count the lines starting with ">".

    outfiles is an (outfile, counts, tpm) triple which is passed
    unchanged to TranscriptDiffExpression.runSleuth.

    The model and the comma-separated list of contrasts are read from
    PARAMS["sleuth_model_<id>"] and PARAMS["sleuth_contrasts_<id>"],
    where <id> is the result of P.snip(design, ".design.tsv"). One
    call with submit=True is made per contrast.
    '''

    design, transcripts = infiles
    outfile, counts, tpm = outfiles

    Design = Expression.ExperimentalDesign(design)
    number_samples = sum(Design.table['include'])

    # one ">" header line is counted as one transcript
    with IOTools.openFile(transcripts, "r") as inf:
        number_transcripts = sum(
            1 for line in inf if line.startswith(">"))

    # TS: rough estimate is 24 bytes * bootstraps * samples * transcripts
    # (https://groups.google.com/forum/#!topic/kallisto-sleuth-users/mp064J-DRfI)
    # I've found this to be a serious underestimate so this is a more
    # conservative estimate
    memory_estimate = (48 * PARAMS["kallisto_bootstrap"] * number_samples *
                       number_transcripts)

    # bytes -> GiB. float() forces true division: dividing two integers
    # truncates under Python 2, which would turn any estimate below
    # 1 GiB into a request for "0.000000G"
    job_memory = "%fG" % (float(memory_estimate) / 1073741824)

    design_id = P.snip(design, ".design.tsv")
    model = PARAMS["sleuth_model_%s" % design_id]

    contrasts = PARAMS["sleuth_contrasts_%s" % design_id].split(",")

    # NOTE(review): every contrast is submitted with the same outfile,
    # counts and tpm paths - confirm that the callee does not overwrite
    # earlier results when several contrasts are configured
    for contrast in contrasts:

        TranscriptDiffExpression.runSleuth(
            design, "quant.dir/kallisto", model, contrast,
            outfile, counts, tpm, PARAMS["sleuth_fdr"],
            submit=True, job_memory=job_memory)
def expressionSummaryPlots(infiles, logfile):
    ''' make summary plots for expression values for each design file

    infiles must unpack into exactly two items, the counts input
    followed by the design input. Both are forwarded, together with
    logfile, to TranscriptDiffExpression.makeExpressionSummaryPlots,
    which is called with submit=True and a memory request of 4G.
    '''

    counts_table, design_table = infiles

    TranscriptDiffExpression.makeExpressionSummaryPlots(
        counts_table,
        design_table,
        logfile,
        submit=True,
        job_memory="4G")