def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Write every read of ``bam`` to ``outputCSV`` through a SlamSeqWriter.

    When ``checkStep([bam, referenceFile], [outputCSV], force)`` returns a
    false value, only a "skipped" message is written to ``log``.

    Parameters:
        referenceFile: reference handed to checkStep and SlamSeqBamFile.
        bam: alignment file whose reads are dumped.
        minQual: accepted but not used by this function.
        outputCSV: path handed to SlamSeqWriter.
        snpsFile: file loaded into an SNPtools.SNPDictionary.
        log: file-like object that receives the "skipped" message.
        printOnly: accepted but not used by this function.
        verbose: accepted but not used by this function.
        force: forwarded to checkStep.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
        return

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    outputFile = SlamSeqWriter(outputCSV)
    try:
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        for chromosome in testFile.getChromosomes():
            # NOTE(review): minQual is not forwarded to readsInChromosome,
            # unlike tcPerReadPos -- confirm whether that is intended.
            for read in testFile.readsInChromosome(chromosome):
                outputFile.write(read)
    finally:
        # Close the writer even if reading the BAM file fails half-way.
        outputFile.close()
def statsComputeOverallRates(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
    """Accumulate per-read conversion rates by strand, write them, plot them.

    Step 1 (CSV): unless checkStep([bam, referenceFile], [outputCSV], force)
    returns a false value, the ``conversionRates`` of every read are summed
    with sumLists into a forward and a reverse total and written to
    ``outputCSV`` with printRates.

    Step 2 (PDF): unless checkStep([bam, referenceFile], [outputPDF], force)
    returns a false value, the "compute_overall_rates" plotter is run through
    callR on ``outputCSV``.

    Parameters:
        referenceFile: reference handed to checkStep and SlamSeqBamFile.
        bam: alignment file to analyse.
        minBaseQual: forwarded to readsInChromosome.
        outputCSV: path of the rates table.
        outputPDF: path handed to the plotter via ``-O``.
        log: file-like object for "skipped" messages; also passed to callR.
        printOnly: forwarded to callR as ``dry``.
        verbose: forwarded to callR.
        force: forwarded to checkStep.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # One accumulator per strand, 25 slots each.
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                rates = read.conversionRates
                if read.direction == ReadDirection.Reverse:
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)

        # Print rates in correct format for plotting
        with open(outputCSV, "w") as fo:
            print("# slamdunk rates v" + __version__, file=fo)
            printRates(totalRatesFwd, totalRatesRev, fo)

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV + " -n " + removeExtension(os.path.basename(bam)) + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
def statsComputeOverallRates(referenceFile, bam, minBaseQual, outputCSV, outputPDF, log, printOnly=False, verbose=True, force=False):
    """Sum read conversion rates per strand into a CSV and plot the result.

    The CSV step runs unless checkStep([bam, referenceFile], [outputCSV],
    force) returns a false value; the PDF step runs unless
    checkStep([bam, referenceFile], [outputPDF], force) returns a false
    value. A skipped step only writes a message to ``log``.

    Parameters:
        referenceFile: reference handed to checkStep and SlamSeqBamFile.
        bam: alignment file to analyse.
        minBaseQual: forwarded to readsInChromosome.
        outputCSV: path of the rates table written with printRates.
        outputPDF: path handed to the plotter via ``-O``.
        log: file-like object for "skipped" messages; also passed to callR.
        printOnly: forwarded to callR as ``dry``.
        verbose: forwarded to callR.
        force: forwarded to checkStep.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing overall rates for file " + bam, file=log)
    else:
        # Strand-specific running totals, 25 slots each.
        totalRatesFwd = [0] * 25
        totalRatesRev = [0] * 25

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, None)
        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                rates = read.conversionRates
                if read.direction == ReadDirection.Reverse:
                    totalRatesRev = sumLists(totalRatesRev, rates)
                else:
                    totalRatesFwd = sumLists(totalRatesFwd, rates)

        # The context manager closes the CSV even if printRates raises.
        with open(outputCSV, "w") as fo:
            print("# slamdunk rates v" + __version__, file=fo)
            printRates(totalRatesFwd, totalRatesRev, fo)

    if not checkStep([bam, referenceFile], [outputPDF], force):
        print("Skipped computing overall rate pdfs for file " + bam, file=log)
    else:
        callR(getPlotter("compute_overall_rates") + " -f " + outputCSV + " -n " + removeExtension(os.path.basename(bam)) + " -O " + outputPDF, log, dry=printOnly, verbose=verbose)
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Compare each read with the values encoded in its name.

    Read names are split on "_": field 0 is taken as the simulated UTR and
    field 2 (as an int) as the simulated T->C count. The UTR found for a read
    is the ``data`` of the first interval in ``bedTree`` overlapping
    ``read.startRefPos:read.endRefPos`` on the read's chromosome, or None.

    One tab-separated row per read is written to ``outputFile``. Finally the
    percentages of reads with (correct UTR and T->C count), (correct UTR,
    wrong T->C count) and (wrong UTR), followed by the read total, are
    printed to stdout. With no reads at all the percentages are 0.0.

    Parameters:
        bam: alignment file to evaluate.
        referenceFile: reference handed to SlamSeqBamFile.
        bed: annotation handed to bedToIntervallTree.
        outputFile: path of the per-read table.
        mainOutput: accepted but not used by this function.
    """
    print("Run " + bam)

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0
    minBaseQual = 0

    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr", "read.utr", "sim.tcCount", "read.tcCount", sep="\t", file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1

                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if overlaps:
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim, utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    if total > 0:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total, wrongPos * 100.0 / total, total)
    else:
        # No reads: report zero percentages instead of dividing by zero.
        print(0.0, 0.0, 0.0, total)
def genomewideReadSeparation(referenceFile, snpsFile, bam, minBaseQual, outputBAMPrefix, conversionThreshold, log):
    """Split a BAM file into T->C reads and background reads.

    A first pass over SlamSeqBamFile collects the names of all reads whose
    ``isTcRead`` flag is set. A second pass over the raw alignments
    (``samFile.fetch()``) writes every record whose ``query_name`` is in that
    collection to ``<outputBAMPrefix>_TCReads.bam`` and every other record to
    ``<outputBAMPrefix>_backgroundReads.bam``. Both outputs are then passed to
    pysamIndex.

    Parameters:
        referenceFile: reference handed to SlamSeqBamFile.
        snpsFile: file loaded into an SNPtools.SNPDictionary.
        bam: alignment file to split; also used as header template.
        minBaseQual: forwarded to readsInChromosome.
        outputBAMPrefix: prefix of the two output BAM paths.
        conversionThreshold: forwarded to readsInChromosome.
        log: accepted but not used by this function.
    """
    from contextlib import closing

    # The FASTA handle is never read here. It is still opened (and closed
    # straight away) so that an unreadable reference fails before any work.
    pysam.FastaFile(referenceFile).close()

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, snps)

    backgroundReadFileName = outputBAMPrefix + "_backgroundReads.bam"
    tcReadFileName = outputBAMPrefix + "_TCReads.bam"

    with closing(pysam.AlignmentFile(bam, "rb")) as samFile:
        chromosomes = testFile.getChromosomes()

        with closing(pysam.AlignmentFile(backgroundReadFileName, "wb", template=samFile)) as backgroundReadFile, \
                closing(pysam.AlignmentFile(tcReadFileName, "wb", template=samFile)) as tcReadFile:

            # Pass 1: remember the names of all T->C reads.
            tcReadNames = set()
            for chromosome in chromosomes:
                for read in testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold):
                    if read.isTcRead:
                        tcReadNames.add(read.name)

            # Pass 2: route every raw alignment by read name.
            for read in samFile.fetch():
                if read.query_name in tcReadNames:
                    tcReadFile.write(read)
                else:
                    backgroundReadFile.write(read)

    # Index only after both outputs have been closed.
    pysamIndex(backgroundReadFileName)
    pysamIndex(tcReadFileName)
def dumpReadInfo(referenceFile, bam, minQual, outputCSV, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Dump all reads of ``bam`` to ``outputCSV`` using SlamSeqWriter.

    If ``checkStep([bam, referenceFile], [outputCSV], force)`` returns a
    false value the function only writes a "skipped" message to ``log``.

    Parameters:
        referenceFile: reference handed to checkStep and SlamSeqBamFile.
        bam: alignment file whose reads are dumped.
        minQual: accepted but not used by this function.
        outputCSV: path handed to SlamSeqWriter.
        snpsFile: file loaded into an SNPtools.SNPDictionary.
        log: file-like object that receives the "skipped" message.
        printOnly: accepted but not used by this function.
        verbose: accepted but not used by this function.
        force: forwarded to checkStep.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
        return

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    outputFile = SlamSeqWriter(outputCSV)
    try:
        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        for chromosome in testFile.getChromosomes():
            # NOTE(review): minQual is not passed on to readsInChromosome
            # here, while tcPerReadPos does pass it -- confirm intent.
            for read in testFile.readsInChromosome(chromosome):
                outputFile.write(read)
    finally:
        # Release the writer even when iteration raises.
        outputFile.close()
def _emitBedGraphRun(handle, chromosome, runStart, runEnd, value):
    """Write one tab-separated run line (positions shifted by +1) to handle."""
    print(chromosome + "\t" + str(runStart + 1) + "\t" + str(runEnd + 1) + "\t" + str(value), file=handle)


def genomewideConversionRates(referenceFile, snpsFile, bam, minBaseQual, outputBedGraphPrefix, conversionThreshold, coverageCutoff, log):
    """Write eight genome-wide bedGraph tracks describing coverage and conversions.

    For every chromosome, per-position counters are filled from the reads
    (reads whose ``isTcRead`` flag is not set contribute coverage but no
    mismatches). The counters are then run-length encoded: a line is written
    to a track whenever its value differs from the previous position.

    Tracks, all prefixed with ``outputBedGraphPrefix``:
        _TC_rates / _AG_rates: conversions divided by strand coverage, 0 when
            coverage is 0 or below ``coverageCutoff``.
        _coverage_plus / _coverage_minus: reads per strand.
        _TC_conversions / _AG_conversions: conversion counts per strand.
        _coverage_T / _coverage_A: strand coverage where the fetched
            reference base is T (plus) or A (minus), otherwise 0.

    Parameters:
        referenceFile: FASTA opened with pysam.FastaFile and handed to
            SlamSeqBamFile.
        snpsFile: file loaded into an SNPtools.SNPDictionary.
        bam: alignment file to analyse.
        minBaseQual: forwarded to readsInChromosome.
        outputBedGraphPrefix: common prefix of all output paths.
        conversionThreshold: forwarded to readsInChromosome.
        coverageCutoff: minimum strand coverage for a non-zero rate.
        log: accepted but not used by this function.
    """
    # (file suffix, track title, track description); the order defines the
    # index used for handles and for the run-length state below.
    tracks = (
        ("_TC_rates_genomewide.bedGraph", "tc-conversions", "# T->C conversions / # reads on T per position genome-wide"),
        ("_AG_rates_genomewide.bedGraph", "ag-conversions", "# A->G conversions / # reads on A per position genome-wide"),
        ("_coverage_plus_genomewide.bedGraph", "plus-strand coverage", "# Reads on plus strand genome-wide"),
        ("_coverage_minus_genomewide.bedGraph", "minus-strand coverage", "# Reads on minus strand genome-wide"),
        ("_TC_conversions_genomewide.bedGraph", "T->C conversions", "# T->C conversions on plus strand genome-wide"),
        ("_AG_conversions_genomewide.bedGraph", "A->G conversions", "# A->G conversions on minus strand genome-wide"),
        ("_coverage_T_genomewide.bedGraph", "T-coverage", "# Plus-strand reads on Ts genome-wide"),
        ("_coverage_A_genomewide.bedGraph", "A-coverage", "# Minus-strand reads on As genome-wide"),
    )

    ref = pysam.FastaFile(referenceFile)
    handles = []
    try:
        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)
        chromosomes = testFile.getChromosomes()

        bedGraphInfo = re.sub("_slamdunk_mapped.*", "", basename(outputBedGraphPrefix))
        print(bedGraphInfo)

        for suffix, _title, _description in tracks:
            handles.append(open(outputBedGraphPrefix + suffix, 'w'))

        for handle, (_suffix, title, description) in zip(handles, tracks):
            print("track type=bedGraph name=\"" + bedGraphInfo + " " + title + "\" description=\"" + description + "\"", file=handle)

        for chromosome in chromosomes:
            chrLength = testFile.getChromosomeLength(chromosome)

            tcCount = [0] * chrLength
            agCount = [0] * chrLength
            coveragePlus = [0] * chrLength
            coverageMinus = [0] * chrLength

            for read in testFile.readsInChromosome(chromosome, minBaseQual, conversionThreshold):
                if not read.isTcRead:
                    # Reads that are not T->C reads keep their coverage but
                    # lose all conversion information.
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse) and 0 <= mismatch.referencePosition < chrLength:
                        if isReverse:
                            agCount[mismatch.referencePosition] += 1
                        else:
                            tcCount[mismatch.referencePosition] += 1

                # Clamp the read span to the chromosome instead of testing
                # every position individually.
                strandCoverage = coverageMinus if isReverse else coveragePlus
                for i in range(max(read.startRefPos, 0), min(read.endRefPos, chrLength)):
                    strandCoverage[i] += 1

            # Run-length state per track: value of the open run and where it
            # started. Every chromosome starts with a run of 0 at position 0.
            # NOTE(review): a run is only written when the value changes, so
            # the run still open at the end of a chromosome is never emitted
            # -- confirm whether a final flush is wanted.
            prevValues = [0] * len(tracks)
            prevStarts = [0] * len(tracks)

            # enumerate() walks the positions lazily on Python 2 and 3 alike.
            for pos, plusDepth in enumerate(coveragePlus):
                minusDepth = coverageMinus[pos]

                refBase = None
                if plusDepth > 0 or minusDepth > 0:
                    # NOTE(review): the base is fetched at pos + 1, not pos;
                    # confirm this offset against the coordinate convention
                    # of startRefPos / referencePosition.
                    refBase = ref.fetch(reference=chromosome, start=pos + 1, end=pos + 2).upper()

                tCoverage = plusDepth if (plusDepth > 0 and refBase == "T") else 0
                aCoverage = minusDepth if (minusDepth > 0 and refBase == "A") else 0

                TCconversionRate = 0
                if plusDepth > 0 and plusDepth >= coverageCutoff:
                    TCconversionRate = float(tcCount[pos]) / float(plusDepth)

                AGconversionRate = 0
                if minusDepth > 0 and minusDepth >= coverageCutoff:
                    AGconversionRate = float(agCount[pos]) / float(minusDepth)

                # Same order as `tracks`.
                values = (TCconversionRate, AGconversionRate, plusDepth, minusDepth, tcCount[pos], agCount[pos], tCoverage, aCoverage)

                for idx, value in enumerate(values):
                    if prevValues[idx] != value:
                        _emitBedGraphRun(handles[idx], chromosome, prevStarts[idx], pos, prevValues[idx])
                        prevValues[idx] = value
                        prevStarts[idx] = pos
    finally:
        # Release every handle that was opened, even after a failure.
        for handle in handles:
            handle.close()
        ref.close()
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Count mismatches per read position and strand, write a table, plot it.

    Step 1 (CSV): unless checkStep([bam, referenceFile], [outputCSV], force)
    returns a false value, each mismatch of each read is counted at its
    ``readPosition`` as either a T->C mismatch (``isTCMismatch``) or another
    mismatch, separately for forward and reverse reads. In addition, every
    position below ``len(read.sequence)`` increments a per-strand read count.
    ``outputCSV`` gets one row per position with the columns: other
    mismatches fwd/rev, T->C mismatches fwd/rev, read count fwd/rev.

    Step 2 (PDF): unless checkStep([outputCSV], [outputPDF], force) returns a
    false value, the "conversion_per_read_position" plotter is run via callR.

    Parameters:
        referenceFile: reference handed to checkStep and SlamSeqBamFile.
        bam: alignment file to analyse.
        minQual: forwarded to readsInChromosome.
        maxReadLength: number of positions tracked (rows in the table).
        outputCSV: path of the table.
        outputPDF: path handed to the plotter via ``-o``.
        snpsFile: file loaded into an SNPtools.SNPDictionary.
        log: file-like object for "skipped" messages; also passed to callR.
        printOnly: forwarded to callR as ``dry``.
        verbose: forwarded to callR.
        force: forwarded to checkStep.

    Raises:
        IndexError: if a mismatch position or a read length reaches beyond
            ``maxReadLength``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
    else:
        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minQual):
                isReverse = read.direction == ReadDirection.Reverse

                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if isReverse:
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1

        # The context manager closes the table even if writing fails.
        with open(outputCSV, "w") as foTC:
            print("# slamdunk tcperreadpos v" + __version__, file=foTC)
            for i in range(0, maxReadLength):
                print(allPerPosFwd[i], allPerPosRev[i], tcPerPosFwd[i], tcPerPosRev[i], totalReadCountFwd[i], totalReadCountRev[i], sep='\t', file=foTC)

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per reads position plot for file " + bam, file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Check every read against the UTR and T->C count encoded in its name.

    Read names are split on "_": field 0 is taken as the simulated UTR and
    field 2 (as an int) as the simulated T->C count. The UTR found for a read
    is the ``data`` of the first interval in ``bedTree`` overlapping
    ``read.startRefPos:read.endRefPos`` on the read's chromosome, or None.

    One tab-separated row per read is written to ``outputFile``. Finally the
    number of reads with (correct UTR and T->C count), (correct UTR, wrong
    T->C count) and (wrong UTR), followed by the read total, are printed to
    stdout.

    Parameters:
        bam: alignment file to evaluate.
        referenceFile: reference handed to SlamSeqBamFile.
        bed: annotation handed to bedToIntervallTree.
        outputFile: path of the per-read table.
        mainOutput: accepted but not used by this function.
    """
    print("Run " + bam)

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0
    minBaseQual = 0

    # The context manager guarantees the table is flushed and closed.
    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr", "read.utr", "sim.tcCount", "read.tcCount", sep="\t", file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1

                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if overlaps:
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim, utrFound, tcCountSim, read.tcCount, sep="\t", file=outFile)

    print(correct, correcPosWrongTC, wrongPos, total)
def tcPerReadPos(referenceFile, bam, minQual, maxReadLength, outputCSV, outputPDF, snpsFile, log, printOnly=False, verbose=True, force=False):
    """Tabulate T->C and other mismatches per read position, then plot them.

    CSV step, skipped when checkStep([bam, referenceFile], [outputCSV],
    force) returns a false value: every mismatch of every read is counted at
    its ``readPosition``, as T->C (``isTCMismatch``) or as another mismatch,
    per strand. Positions below ``len(read.sequence)`` also increment a
    per-strand read counter. The table has one row per position with the
    columns: other mismatches fwd/rev, T->C mismatches fwd/rev, read count
    fwd/rev.

    PDF step, skipped when checkStep([outputCSV], [outputPDF], force)
    returns a false value: runs the "conversion_per_read_position" plotter
    through callR.

    Parameters:
        referenceFile: reference handed to checkStep and SlamSeqBamFile.
        bam: alignment file to analyse.
        minQual: forwarded to readsInChromosome.
        maxReadLength: number of positions tracked (rows in the table).
        outputCSV: path of the table.
        outputPDF: path handed to the plotter via ``-o``.
        snpsFile: file loaded into an SNPtools.SNPDictionary.
        log: file-like object for "skipped" messages; also passed to callR.
        printOnly: forwarded to callR as ``dry``.
        verbose: forwarded to callR.
        force: forwarded to checkStep.

    Raises:
        IndexError: if a mismatch position or a read length reaches beyond
            ``maxReadLength``.
    """
    if not checkStep([bam, referenceFile], [outputCSV], force):
        print("Skipped computing T->C per reads position for file " + bam, file=log)
    else:
        totalReadCountFwd = [0] * maxReadLength
        totalReadCountRev = [0] * maxReadLength

        tcPerPosRev = [0] * maxReadLength
        tcPerPosFwd = [0] * maxReadLength

        allPerPosRev = [0] * maxReadLength
        allPerPosFwd = [0] * maxReadLength

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        # Go through one chr after the other
        testFile = SlamSeqBamFile(bam, referenceFile, snps)

        for chromosome in testFile.getChromosomes():
            for read in testFile.readsInChromosome(chromosome, minQual):
                # The strand decides both the mismatch test and the totals.
                isReverse = read.direction == ReadDirection.Reverse

                tcCounts = [0] * maxReadLength
                mutCounts = [0] * maxReadLength

                for mismatch in read.mismatches:
                    if mismatch.isTCMismatch(isReverse):
                        tcCounts[mismatch.readPosition] += 1
                    else:
                        mutCounts[mismatch.readPosition] += 1

                query_length = len(read.sequence)
                if isReverse:
                    tcPerPosRev = sumLists(tcPerPosRev, tcCounts)
                    allPerPosRev = sumLists(allPerPosRev, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountRev[i] += 1
                else:
                    tcPerPosFwd = sumLists(tcPerPosFwd, tcCounts)
                    allPerPosFwd = sumLists(allPerPosFwd, mutCounts)
                    for i in range(0, query_length):
                        totalReadCountFwd[i] += 1

        # Closed automatically, also when a write raises.
        with open(outputCSV, "w") as foTC:
            print("# slamdunk tcperreadpos v" + __version__, file=foTC)
            for i in range(0, maxReadLength):
                print(allPerPosFwd[i], allPerPosRev[i], tcPerPosFwd[i], tcPerPosRev[i], totalReadCountFwd[i], totalReadCountRev[i], sep='\t', file=foTC)

    if not checkStep([outputCSV], [outputPDF], force):
        print("Skipped computing T->C per reads position plot for file " + bam, file=log)
    else:
        callR(getPlotter("conversion_per_read_position") + " -i " + outputCSV + " -o " + outputPDF, log, dry=printOnly, verbose=verbose)