Beispiel #1
0
def statsComputeOverallRates(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
    """Sum per-read conversion rates over a BAM file, write them and plot them.

    The function has two independent steps, each guarded by ``checkStep``:

    1. Every read of every chromosome is visited, and its
       ``conversionRates`` vector is added (via ``sumLists``) to a forward or
       reverse-strand total of 25 entries. The totals are written to
       ``outputCSV`` with ``printRates``.
    2. The "compute_overall_rates" plotter is invoked through ``callR`` to
       turn ``outputCSV`` into ``outputPDF``.

    Args:
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bam: BAM file to analyse.
        minBaseQual: Forwarded to ``readsInChromosome``.
        outputCSV: Destination of the summed rates.
        outputPDF: Destination of the plot.
        log: File-like object that receives the "skipped" messages and is
            passed on to ``callR``.
        printOnly: Passed to ``callR`` as ``dry``.
        verbose: Passed to ``callR`` as ``verbose``.
        force: Passed to ``checkStep``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # One accumulator per strand; presumably a flattened 5x5 base
        # conversion matrix -- TODO confirm against conversionRates.
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25

        # Go through one chromosome after the other.
        testFile = SlamSeqBamFile(bam, referenceFile, None)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                rates = read.conversionRates

                # NOTE: a per-read T->C histogram of fixed size 100 used to be
                # filled here. It was never written anywhere and raised
                # IndexError for reads with >= 100 conversions, so it is gone.
                if read.direction == ReadDirection.Reverse:
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)

        # Write rates in the format expected by the plotter. The context
        # manager closes the file even if printRates raises.
        with open(outputCSV, "w") as fo:
            print("# slamdunk rates v" + __version__, file=fo)
            printRates(totalRatesFwd, totalRatesRev, fo)

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        sampleName = removeExtension(os.path.basename(bam))
        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV + " -n " + sampleName + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
Beispiel #2
0
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Compare simulated read annotations against what the BAM reports.

    The read name is split on "_": field 0 is taken as the simulated UTR name
    and field 2 as the simulated T->C count. For every read the first BED
    interval overlapping ``[startRefPos, endRefPos)`` on the read's chromosome
    is looked up. One tab-separated line per read is written to
    ``outputFile``, and a summary (percent correct, percent correct position
    but wrong T->C count, percent wrong position, total reads) is printed to
    stdout.

    Args:
        bam: BAM file with simulated reads.
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bed: BED file converted with ``bedToIntervallTree``.
        outputFile: Path of the per-read TSV report.
        mainOutput: Unused by this function; kept for interface compatibility.
    """
    print("Run " + bam)

    # Go through one chromosome after the other.
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0

    minBaseQual = 0

    # The context manager guarantees the report is flushed and closed, which
    # the previous implementation never did.
    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr", "read.utr", "sim.tcCount", "read.tcCount", sep="\t", file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1
                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if overlaps:
                        utrFound = overlaps[0].data

                if utrFound != utrSim:
                    wrongPos += 1
                elif tcCountSim == read.tcCount:
                    correct += 1
                else:
                    correcPosWrongTC += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim, utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    if total == 0:
        # No reads at all: report zeros instead of dividing by zero.
        print(0.0, 0.0, 0.0, total)
    else:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total, wrongPos * 100.0 / total, total)
Beispiel #3
0
def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Write every read of a BAM file to ``outputCSV`` via ``SlamSeqWriter``.

    The step runs only when ``checkStep([bam, referenceFile], [outputCSV],
    force)`` is truthy; otherwise a "skipped" message is written to ``log``.

    Args:
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bam: BAM file to dump.
        minQual: Currently not forwarded to ``readsInChromosome``.
        outputCSV: Destination handed to ``SlamSeqWriter``.
        snpsFile: SNP file loaded into a ``SNPtools.SNPDictionary``.
        log: File-like object receiving the "skipped" message.
        printOnly: Unused by this function.
        verbose: Unused by this function.
        force: Passed to ``checkStep``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        # The message used to be a copy of the one in tcPerReadPos.
        print("Skipped dumping read info for file " + bam, file=log)
    else:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        outputFile = SlamSeqWriter(outputCSV)
        try:
            # Go through one chromosome after the other.
            testFile = SlamSeqBamFile(bam, referenceFile, snps)

            for chromosome in testFile.getChromosomes():
                # NOTE(review): minQual is accepted by this function but not
                # passed on here, unlike in tcPerReadPos -- confirm whether
                # dumping unfiltered reads is intended.
                for read in testFile.readsInChromosome(chromosome):
                    outputFile.write(read)
        finally:
            # Close the writer even when reading the BAM fails halfway.
            outputFile.close()
Beispiel #4
0
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Count T->C and other mismatches per read position and plot them.

    Step 1 (guarded by ``checkStep([bam, referenceFile], [outputCSV],
    force)``) visits every read and, per strand, accumulates for each read
    position: the number of T->C mismatches, the number of all other
    mismatches, and the number of reads covering that position (positions
    ``0 .. len(read.sequence) - 1``). The six columns are written to
    ``outputCSV``, one row per read position.

    Step 2 (guarded by ``checkStep([outputCSV], [outputPDF], force)``) calls
    the "conversion_per_read_position" plotter through ``callR``.

    Args:
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bam: BAM file to analyse.
        minQual: Forwarded to ``readsInChromosome``.
        maxReadLength: Length of all per-position accumulators.
        outputCSV: Destination of the per-position table.
        outputPDF: Destination of the plot.
        snpsFile: SNP file loaded into a ``SNPtools.SNPDictionary``.
        log: File-like object receiving "skipped" messages; passed to ``callR``.
        printOnly: Passed to ``callR`` as ``dry``.
        verbose: Passed to ``callR`` as ``verbose``.
        force: Passed to ``checkStep``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
    else:
        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chromosome after the other.
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minQual):
                # The strand does not change within a read, so test it once.
                isReverse = read.direction == ReadDirection.Reverse

                # NOTE(review): positions >= maxReadLength raise IndexError
                # below; callers must pass a maxReadLength that covers the
                # longest read.
                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if isReverse:
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)
                    readCounts = totalReadCountRev
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)
                    readCounts = totalReadCountFwd

                for i in range(query_length):
                    readCounts[i] += 1

        # The context manager closes the table even if a write fails.
        with open(outputCSV, "w") as foTC:
            print("# slamdunk tcperreadpos v" + __version__, file=foTC)

            for i in range(maxReadLength):
                print(allPerPosFwd[i], allPerPosRev[i], tcPerPosFwd[i], tcPerPosRev[i], totalReadCountFwd[i], totalReadCountRev[i], sep='\t', file=foTC)

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per reads position plot for file " + bam, file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
Beispiel #5
0
def statsComputeTCContext(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
    """Count the bases neighbouring each T in the reads and plot the result.

    Step 1 (guarded by ``checkStep([bam, referenceFile], [outputCSV],
    force)``) scans the query sequence of every read returned by
    ``pysam.AlignmentFile.fetch`` per chromosome:

    * non-reverse reads: for each "T", the preceding base is counted as
      5' context and the following base as 3' context;
    * reverse reads: for each "A", the following base is the 5' context and
      the preceding base the 3' context, and the dinucleotide is passed
      through ``complement`` before counting.

    The counts are written to ``outputCSV`` as two three-line sections
    (header, forward counts, reverse counts) for 5' and 3' context.

    Step 2 (guarded by ``checkStep([bam, referenceFile], [outputPDF],
    force)``) writes a one-line sample sheet to a temporary file and calls
    the "compute_context_TC_rates" plotter through ``callR``.

    Args:
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bam: BAM file to analyse.
        minBaseQual: Unused by this function.
        outputCSV: Destination of the context counts.
        outputPDF: Destination of the plot.
        log: File-like object receiving "skipped" messages; passed to ``callR``.
        printOnly: Passed to ``callR`` as ``dry``.
        verbose: Passed to ``callR`` as ``verbose``.
        force: Passed to ``checkStep``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        # The message used to be a copy of the one in statsComputeOverallRates.
        print("Skipped computing T->C context for file " + bam, file=log)
    else:
        frontCombinations = ["AT", "CT", "GT", "TT", "NT"]
        backCombinations = ["TA", "TC", "TG", "TT", "TN"]

        counts = {
            '5prime': {
                'fwd': dict((combination, 0) for combination in frontCombinations),
                'rev': dict((combination, 0) for combination in frontCombinations),
            },
            '3prime': {
                'fwd': dict((combination, 0) for combination in backCombinations),
                'rev': dict((combination, 0) for combination in backCombinations),
            },
        }

        bamFile = pysam.AlignmentFile(bam, "rb")
        try:
            # SlamSeqBamFile is only used to obtain the chromosome list.
            testFile = SlamSeqBamFile(bam, referenceFile, None)

            for chromosome in testFile.getChromosomes():
                for read in bamFile.fetch(region=chromosome):
                    sequence = read.query_sequence
                    if sequence is None:
                        # Records without a stored sequence carry no context.
                        continue

                    lastIndex = len(sequence) - 1

                    if read.is_reverse:
                        fivePrime = counts['5prime']['rev']
                        threePrime = counts['3prime']['rev']
                        for i, base in enumerate(sequence):
                            if base != "A":
                                continue
                            # On the reverse strand the 5' neighbour is the
                            # next base of the stored sequence.
                            if i < lastIndex:
                                fivePrime[complement(sequence[i + 1] + "A")] += 1
                            if i > 0:
                                threePrime[complement("A" + sequence[i - 1])] += 1
                    else:
                        fivePrime = counts['5prime']['fwd']
                        threePrime = counts['3prime']['fwd']
                        for i, base in enumerate(sequence):
                            if base != "T":
                                continue
                            if i > 0:
                                fivePrime[sequence[i - 1] + "T"] += 1
                            if i < lastIndex:
                                threePrime["T" + sequence[i + 1]] += 1
        finally:
            # The alignment file used to stay open for the process lifetime.
            bamFile.close()

        # Write counts in the format expected by the plotter.
        with open(outputCSV, "w") as fo:
            for readEnd, combinations in (('5prime', frontCombinations), ('3prime', backCombinations)):
                print("\t".join(combinations), file=fo)
                for strand in ('fwd', 'rev'):
                    print("\t".join(str(counts[readEnd][strand][combination]) for combination in combinations), file=fo)

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing T->C context pdfs for file " + bam, file=log)
    else:
        # Text mode is required: the default binary mode rejects the str
        # written by print() on Python 3. delete=False keeps the sheet on
        # disk so the plotter can read it after the handle is closed.
        with tempfile.NamedTemporaryFile(mode="w", delete=False) as f:
            print(removeExtension(os.path.basename(bam)), outputCSV, sep='\t', file=f)

        callR(getPlotter("compute_context_TC_rates") + " -f " + f.name + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
Beispiel #6
0
def genomewideConversionRates(referenceFile, snpsFile, bam, minBaseQual, outputBedGraphPrefix, conversionThreshold, coverageCutoff, log):
    """Write genome-wide coverage and conversion tracks as bedGraph files.

    Eight files named ``outputBedGraphPrefix + <suffix>`` are produced, each
    starting with a ``track`` header line:

    * ``_TC_rates_genomewide`` / ``_AG_rates_genomewide``: conversions divided
      by strand coverage, 0 where coverage is 0 or below ``coverageCutoff``;
    * ``_coverage_plus_genomewide`` / ``_coverage_minus_genomewide``: reads
      per position for non-reverse / reverse reads;
    * ``_TC_conversions_genomewide`` / ``_AG_conversions_genomewide``:
      conversion counts from non-reverse / reverse reads;
    * ``_coverage_T_genomewide`` / ``_coverage_A_genomewide``: strand coverage
      restricted to positions where the fetched reference base is T / A.

    Values are run-length encoded: a line is written whenever the value at
    the current position differs from the running value.

    Args:
        referenceFile: FASTA reference, opened with ``pysam.FastaFile`` and
            also handed to ``SlamSeqBamFile``.
        snpsFile: SNP file loaded into a ``SNPtools.SNPDictionary``.
        bam: BAM file to analyse.
        minBaseQual: Forwarded to ``readsInChromosome``.
        outputBedGraphPrefix: Path prefix of all output files; its basename
            (up to "_slamdunk_mapped") is used as the track name.
        conversionThreshold: Forwarded to ``readsInChromosome``.
        coverageCutoff: Minimum strand coverage for a non-zero rate.
        log: Unused by this function.
    """
    ref = pysam.FastaFile(referenceFile)
    trackFiles = []

    def openTrack(suffix):
        # Register the handle immediately so the finally clause below closes
        # it even when a later open() or write fails.
        handle = open(outputBedGraphPrefix + suffix, 'w')
        trackFiles.append(handle)
        return handle

    try:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chromosome after the other.
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        chromosomes = testFile.getChromosomes()

        bedGraphInfo = re.sub("_slamdunk_mapped.*","",basename(outputBedGraphPrefix))
        print(bedGraphInfo)

        fileBedGraphRatesPlus = openTrack("_TC_rates_genomewide.bedGraph")
        fileBedGraphRatesMinus = openTrack("_AG_rates_genomewide.bedGraph")
        fileBedGraphCoveragePlus = openTrack("_coverage_plus_genomewide.bedGraph")
        fileBedGraphCoverageMinus = openTrack("_coverage_minus_genomewide.bedGraph")
        fileBedGraphTCConversions = openTrack("_TC_conversions_genomewide.bedGraph")
        fileBedGraphAGConversions = openTrack("_AG_conversions_genomewide.bedGraph")
        fileBedGraphT = openTrack("_coverage_T_genomewide.bedGraph")
        fileBedGraphA = openTrack("_coverage_A_genomewide.bedGraph")

        print("track type=bedGraph name=\"" + bedGraphInfo + " tc-conversions\" description=\"# T->C conversions / # reads on T per position genome-wide\"", file=fileBedGraphRatesPlus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " ag-conversions\" description=\"# A->G conversions / # reads on A per position genome-wide\"", file=fileBedGraphRatesMinus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " plus-strand coverage\" description=\"# Reads on plus strand genome-wide\"", file=fileBedGraphCoveragePlus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " minus-strand coverage\" description=\"# Reads on minus strand genome-wide\"", file=fileBedGraphCoverageMinus)
        print("track type=bedGraph name=\"" + bedGraphInfo + " T->C conversions\" description=\"# T->C conversions on plus strand genome-wide\"", file=fileBedGraphTCConversions)
        print("track type=bedGraph name=\"" + bedGraphInfo + " A->G conversions\" description=\"# A->G conversions on minus strand genome-wide\"", file=fileBedGraphAGConversions)
        print("track type=bedGraph name=\"" + bedGraphInfo + " T-coverage\" description=\"# Plus-strand reads on Ts genome-wide\"", file=fileBedGraphT)
        print("track type=bedGraph name=\"" + bedGraphInfo + " A-coverage\" description=\"# Minus-strand reads on As genome-wide\"", file=fileBedGraphA)

        for chromosome in chromosomes:

            chrLength = testFile.getChromosomeLength(chromosome)

            # Per-position accumulators. The T/A coverage tracks are derived
            # on the fly below, so no chromosome-sized list is kept for them.
            tcCount = [0] * chrLength
            agCount = [0] * chrLength

            coveragePlus = [0] * chrLength
            coverageMinus = [0] * chrLength

            readIterator = testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold)

            for read in readIterator:
                if not read.isTcRead:
                    # Reads not flagged as T->C reads contribute coverage only.
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse
                conversions = agCount if isReverse else tcCount
                coverage = coverageMinus if isReverse else coveragePlus

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse) and 0 <= mismatch.referencePosition < chrLength:
                        conversions[mismatch.referencePosition] += 1

                # Clamp to the chromosome instead of testing every position.
                for i in range(max(read.startRefPos, 0), min(read.endRefPos, chrLength)):
                    coverage[i] += 1

            # Running value and start position of the current run, per track.
            # NOTE(review): a run is only written when the value changes, so
            # a run still open at the end of a chromosome is never written.
            prevCoveragePlus = 0
            prevCoveragePlusPos = 0
            prevCoverageMinus = 0
            prevCoverageMinusPos = 0
            prevTCConversionRate = 0
            prevTCConversionRatePos = 0
            prevAGConversionRate = 0
            prevAGConversionRatePos = 0
            prevTCConversions = 0
            prevTCConversionPos = 0
            prevAGConversions = 0
            prevAGConversionPos = 0
            prevTCoverage = 0
            prevTCoveragePos = 0
            prevACoverage = 0
            prevACoveragePos = 0

            for pos in range(0, chrLength):
                plusCoverage = coveragePlus[pos]
                minusCoverage = coverageMinus[pos]

                if prevCoveragePlus != plusCoverage:
                    print(chromosome + "\t" + str(prevCoveragePlusPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevCoveragePlus), file = fileBedGraphCoveragePlus)
                    prevCoveragePlus = plusCoverage
                    prevCoveragePlusPos = pos
                if prevCoverageMinus != minusCoverage:
                    print(chromosome + "\t" + str(prevCoverageMinusPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevCoverageMinus), file = fileBedGraphCoverageMinus)
                    prevCoverageMinus = minusCoverage
                    prevCoverageMinusPos = pos

                tCoverageAtPos = 0
                aCoverageAtPos = 0

                if plusCoverage > 0 or minusCoverage > 0:
                    # Look the reference base up once for both strands.
                    # NOTE(review): pysam fetch coordinates are 0-based and
                    # half-open, so this reads index pos + 1 -- confirm the
                    # offset is intended.
                    base = ref.fetch(reference=chromosome, start = pos + 1, end = pos + 2).upper()
                    if plusCoverage > 0 and base == "T":
                        tCoverageAtPos = plusCoverage
                    if minusCoverage > 0 and base == "A":
                        aCoverageAtPos = minusCoverage

                if prevTCoverage != tCoverageAtPos:
                    print(chromosome + "\t" + str(prevTCoveragePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevTCoverage), file = fileBedGraphT)
                    prevTCoverage = tCoverageAtPos
                    prevTCoveragePos = pos

                if prevACoverage != aCoverageAtPos:
                    print(chromosome + "\t" + str(prevACoveragePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevACoverage), file = fileBedGraphA)
                    prevACoverage = aCoverageAtPos
                    prevACoveragePos = pos

                if prevTCConversions != tcCount[pos]:
                    print(chromosome + "\t" + str(prevTCConversionPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevTCConversions), file = fileBedGraphTCConversions)
                    prevTCConversions = tcCount[pos]
                    prevTCConversionPos = pos

                if prevAGConversions != agCount[pos]:
                    print(chromosome + "\t" + str(prevAGConversionPos + 1) + "\t" + str(pos + 1) + "\t" + str(prevAGConversions), file = fileBedGraphAGConversions)
                    prevAGConversions = agCount[pos]
                    prevAGConversionPos = pos

                TCconversionRate = 0
                if plusCoverage > 0 and plusCoverage >= coverageCutoff:
                    TCconversionRate = float(tcCount[pos]) / float(plusCoverage)

                AGconversionRate = 0
                if minusCoverage > 0 and minusCoverage >= coverageCutoff:
                    AGconversionRate = float(agCount[pos]) / float(minusCoverage)

                if prevTCConversionRate != TCconversionRate:
                    print(chromosome + "\t" + str(prevTCConversionRatePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevTCConversionRate), file = fileBedGraphRatesPlus)
                    prevTCConversionRate = TCconversionRate
                    prevTCConversionRatePos = pos

                if prevAGConversionRate != AGconversionRate:
                    print(chromosome + "\t" + str(prevAGConversionRatePos + 1) + "\t" + str(pos + 1) + "\t" + str(prevAGConversionRate), file = fileBedGraphRatesMinus)
                    prevAGConversionRate = AGconversionRate
                    prevAGConversionRatePos = pos
    finally:
        # Release every output handle and the reference, also on failure.
        for handle in trackFiles:
            handle.close()
        ref.close()
Beispiel #7
0
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Compare simulated read annotations against what the BAM reports.

    The read name is split on "_": field 0 is taken as the simulated UTR name
    and field 2 as the simulated T->C count. For every read the first BED
    interval overlapping ``[startRefPos, endRefPos)`` on the read's chromosome
    is looked up. One tab-separated line per read is written to
    ``outputFile``, and a summary (percent correct, percent correct position
    but wrong T->C count, percent wrong position, total reads) is printed to
    stdout.

    Args:
        bam: BAM file with simulated reads.
        referenceFile: Reference path handed to ``SlamSeqBamFile``.
        bed: BED file converted with ``bedToIntervallTree``.
        outputFile: Path of the per-read TSV report.
        mainOutput: Unused by this function; kept for interface compatibility.
    """
    print("Run " + bam)

    # Go through one chromosome after the other.
    testFile = SlamSeqBamFile(bam, referenceFile, None)

    chromosomes = testFile.getChromosomes()

    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0

    minBaseQual = 0

    # The context manager guarantees the report is flushed and closed, which
    # the previous implementation never did.
    with open(outputFile, "w") as outFile:
        print("read.name",
              "read.chromosome",
              "read.startRefPos",
              "sim.utr",
              "read.utr",
              "sim.tcCount",
              "read.tcCount",
              sep="\t",
              file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1
                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(
                        bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if overlaps:
                        utrFound = overlaps[0].data

                if utrFound != utrSim:
                    wrongPos += 1
                elif tcCountSim == read.tcCount:
                    correct += 1
                else:
                    correcPosWrongTC += 1

                print(read.name,
                      read.chromosome,
                      read.startRefPos,
                      utrSim,
                      utrFound,
                      tcCountSim,
                      read.tcCount,
                      sep="\t",
                      file=outFile)

    if total == 0:
        # No reads at all: report zeros instead of dividing by zero.
        print(0.0, 0.0, 0.0, total)
    else:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total,
              wrongPos * 100.0 / total, total)