def evalHalfLifes(trueHLFile, simHLFile, predHLFile, outputPDF, erroutputCSV):
    """Run the half-life evaluation R script (``pathEvalHalfLife``).

    The arguments are forwarded to the script as command-line options:

    :param trueHLFile: passed as ``-t``
    :param simHLFile: passed as ``-s``
    :param predHLFile: passed as ``-p``
    :param outputPDF: passed as ``-o``
    :param erroutputCSV: passed as ``-m``

    The command is executed through ``run`` with ``sys.stderr`` as log target,
    never as a dry run and without verbose output.
    """
    arguments = [
        "Rscript", pathEvalHalfLife,
        "-t", trueHLFile,
        "-p", predHLFile,
        "-s", simHLFile,
        "-o", outputPDF,
        "-m", erroutputCSV,
    ]
    run(" ".join(arguments), sys.stderr, dry=False, verbose=False)
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2", outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False):
    """Build the ngm command line for one input file and execute it via ``run``.

    :param inputBAM: read file handed to ngm with ``-q``
    :param inputReference: reference handed to ngm with ``-r``
    :param outputSAM: output path (``-o``); anything not ending in ``.sam``
        makes ngm write BAM directly (``-b``)
    :param log: file-like object for ``run`` output and the skip message
    :param quantseqMapping: when ``True`` the base parameters are reset to
        ``--no-progress``
    :param endtoendMapping: when ``True`` ``-e`` is appended, otherwise ``-l``
    :param threads: value for ``-t``
    :param parameter: base ngm parameter string
    :param outputSuffix: accepted for interface compatibility; not used here
    :param trim5p: if > 0, appended as ``-5``
    :param maxPolyA: if > -1, appended as ``--max-polya``
    :param topn: if > 1, appended as ``-n <topn> --strata``
    :param sampleId: if not ``None``, appended as ``--rg-id``
    :param sampleName: with ``sampleType`` and ``sampleTime`` forms the
        ``--rg-sm name:type:time`` tag (only added together with ``--rg-id``)
    :param printOnly: forwarded to ``run`` as ``dry``
    :param verbose: forwarded to ``run``
    :param force: forwarded to ``checkStep``
    """
    if quantseqMapping is True:
        parameter = "--no-progress"

    if trim5p > 0:
        parameter += " -5 " + str(trim5p)

    if maxPolyA > -1:
        parameter += " --max-polya " + str(maxPolyA)

    if endtoendMapping is True:
        parameter += " -e "
    else:
        parameter += " -l "

    if sampleId is not None:
        parameter += " --rg-id " + str(sampleId)
        # The sample tag is only attached when a read-group id is present.
        if sampleName != "":
            parameter += " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime)

    if topn > 1:
        parameter += " -n " + str(topn) + " --strata "

    if checkStep([inputReference, inputBAM], [replaceExtension(outputSAM, ".bam")], force):
        # The SAM and BAM invocations differ only in the "-b" switch
        # (output BAM directly), so a single command template is used.
        formatFlag = "" if outputSAM.endswith(".sam") else " -b"
        run(getBinary("ngm") + formatFlag + " -r " + inputReference + " -q " + inputBAM + " -t " + str(threads) + " " + parameter + " -o " + outputSAM, log, verbose=verbose, dry=printOnly)
    else:
        print("Skipped mapping for " + inputBAM, file=log)
def runSam2bam(inFile, outFile, log, index=True, sort=True, delinFile=False, onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None, threads=1, verbose=False, dry=False):
    """Convert ``inFile`` to BAM with ``samtools view``, optionally sort and index it.

    :param inFile: input alignment file
    :param outFile: BAM file to create
    :param log: file-like object receiving command output / messages
    :param index: index ``outFile`` with ``pysamIndex`` at the end
    :param sort: run ``samtools sort`` on the converted file
    :param delinFile: delete ``inFile`` after conversion; also enables the
        "already converted" shortcut (output exists, input is gone)
    :param onlyUnique: if set and no ``filterMQ`` is given, ``-q 1`` is used
    :param onlyProperPaired: adds ``-f 2``
    :param filterMQ: if > 0, passed as ``-q``
    :param L: if not ``None``, passed as ``-L``
    :param threads: passed as ``-@``
    :param verbose: forwarded to ``run``
    :param dry: forwarded to ``run``; in a dry run no file is renamed,
        removed or indexed
    """
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        print("Skipping sam2bam for " + outFile, file=log)
    else:
        if onlyUnique and filterMQ == 0:
            filterMQ = 1

        cmd = [getBinary("samtools"), "view", "-@", str(threads), "-Sb", "-o", outFile, inFile]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sort:
            # samtools sort writes to outFile, so the unsorted data is moved aside first.
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            run(" ".join([getBinary("samtools"), "sort", "-@", str(threads), "-o", outFile, tmp]), log, verbose=verbose, dry=dry)
            # In a dry run the temp file was never created, so there is nothing to remove.
            if not dry:
                removeFile(tmp)

        if delinFile and not dry:
            removeFile(inFile)

    # A dry run produces no output file, hence nothing to index.
    if index and not dry:
        pysamIndex(outFile)
def plotconversiondifferences(simDir, slamDir, conversionRate, outputPDF):
    """Run the conversion-rate evaluation R script on simulated vs. counted files.

    :param simDir: prefix globbed for ``*_utrsummary.csv`` files
    :param slamDir: prefix globbed for
        ``*_reads_slamdunk_mapped_filtered_tcount.csv`` files
    :param conversionRate: passed to the script as ``-c``
    :param outputPDF: passed to the script as ``-o``
    :raises RuntimeError: if the two globs yield a different number of files
    """
    simulated = sorted(glob.glob(simDir + "*_utrsummary.csv"))
    counted = sorted(glob.glob(slamDir + "*_reads_slamdunk_mapped_filtered_tcount.csv"))

    # Both sorted lists are handed over positionally, so they must pair up.
    if len(simulated) != len(counted):
        raise RuntimeError("Couldn't match files with timepoints")

    command = " ".join([
        "Rscript", pathEvalConversionrates,
        "-c", str(conversionRate),
        "-s", ",".join(simulated),
        "-f", ",".join(counted),
        "-o", outputPDF,
    ])
    run(command, sys.stderr, dry=False, verbose=False)
def Map(inputBAM, inputReference, outputSAM, log, quantseqMapping, endtoendMapping, threads=1, parameter="--no-progress --slam-seq 2", outputSuffix="_ngm_slamdunk", trim5p=0, maxPolyA=-1, topn=1, sampleId=None, sampleName="NA", sampleType="NA", sampleTime=0, printOnly=False, verbose=True, force=False, isPaired=False):
    """Build the ngm command line for single- or paired-end input and run it.

    :param inputBAM: a single read file path, or (when ``isPaired``) a
        sequence whose first two items are the mate files
    :param inputReference: reference handed to ngm with ``-r``
    :param outputSAM: output path (``-o``); anything not ending in ``.sam``
        adds ``-b``
    :param log: file-like object for ``run`` output and the skip message
    :param quantseqMapping: if truthy the base parameters are reset to
        ``--no-progress``
    :param endtoendMapping: if truthy ``-e`` is appended, otherwise ``-l``
    :param threads: value for ``-t``
    :param parameter: base ngm parameter string
    :param outputSuffix: accepted for interface compatibility; not used here
    :param trim5p: if > 0, appended as ``-5``
    :param maxPolyA: if > -1, appended as ``--max-polya``
    :param topn: if > 1, appended as ``-n <topn> --strata``
    :param sampleId: if not ``None``, appended as ``--rg-id``
    :param sampleName: with ``sampleType`` and ``sampleTime`` forms the
        ``--rg-sm name:type:time`` tag (only added together with ``--rg-id``)
    :param printOnly: forwarded to ``run`` as ``dry``
    :param verbose: forwarded to ``run``
    :param force: forwarded to ``checkStep``
    :param isPaired: treat ``inputBAM`` as a pair and use ``-1``/``-2``
        instead of ``-q``
    """
    if quantseqMapping:
        parameter = "--no-progress"

    if trim5p > 0:
        parameter = parameter + " -5 " + str(trim5p)

    if maxPolyA > -1:
        parameter = parameter + " --max-polya " + str(maxPolyA)

    if endtoendMapping:
        parameter = parameter + " -e "
    else:
        parameter = parameter + " -l "

    if sampleId is not None:
        parameter = parameter + " --rg-id " + str(sampleId)
        # The sample tag is only attached when a read-group id is present.
        if sampleName != "":
            parameter = parameter + " --rg-sm " + sampleName + ":" + sampleType + ":" + str(sampleTime)

    if topn > 1:
        parameter = parameter + " -n " + str(topn) + " --strata "

    # files[0] is the reference, files[1:] the read file(s).
    files = [inputReference]
    if isPaired:
        files.extend(inputBAM)
    else:
        files.append(inputBAM)
    files = [os.path.expanduser(p) for p in files]

    if checkStep(files, [replaceExtension(outputSAM, ".bam")], force):
        formatFlag = "" if outputSAM.endswith(".sam") else "-b"
        if isPaired:
            readArgs = "-1 %s -2 %s" % (files[1], files[2])
        else:
            readArgs = "-q %s" % files[1]
        cmd = "ngm %s -r %s %s -t %s %s -o %s" % (
            formatFlag, files[0], readArgs, threads, parameter, outputSAM)
        run(cmd, log, verbose=verbose, dry=printOnly)
    else:
        # The conditional must be parenthesised: without it the paired case
        # printed only the file name and lost the "Skipped mapping for" prefix.
        skipped = inputBAM[0] if isPaired else inputBAM
        print("Skipped mapping for " + skipped, file=log)
def plotHalfLifes(bed, simDir, slamDir, timePointsStr, conversionRate, outputPDF):
    """Run the half-life evaluation R script (``pathEvalHalfLifes``).

    :param bed: passed to the script as ``-b``
    :param simDir: prefix globbed for ``*_utrsummary.csv`` files
    :param slamDir: prefix globbed for
        ``*_reads_slamdunk_mapped_filtered_tcount.csv`` files
    :param timePointsStr: comma-separated time points, passed as ``-t``
    :param conversionRate: passed to the script as ``-c``
    :param outputPDF: passed to the script as ``-o``
    :raises RuntimeError: if the number of simulated files, counted files and
        time points is not identical
    """
    simulated = sorted(glob.glob(simDir + "*_utrsummary.csv"))
    counted = sorted(glob.glob(slamDir + "*_reads_slamdunk_mapped_filtered_tcount.csv"))
    timePoints = timePointsStr.split(",")

    # Files and time points are matched by position, so all three must align.
    if not (len(simulated) == len(counted) == len(timePoints)):
        raise RuntimeError("Couldn't match files with timepoints")

    command = " ".join([
        "Rscript", pathEvalHalfLifes,
        "-b", bed,
        "-c", str(conversionRate),
        "-s", ",".join(simulated),
        "-f", ",".join(counted),
        "-t", ",".join(timePoints),
        "-o", outputPDF,
    ])
    run(command, sys.stderr, dry=False, verbose=False)
def plotconversiondifferences(simDir, slamDir, conversionRate, outputPDF):
    """Compare simulated and counted conversion rates via the R evaluation script.

    Collects ``*_utrsummary.csv`` files under the ``simDir`` prefix and
    ``*_reads_slamdunk_mapped_filtered_tcount.csv`` files under the
    ``slamDir`` prefix (both sorted), then calls ``pathEvalConversionrates``
    with ``-c conversionRate``, the two comma-joined file lists (``-s`` and
    ``-f``) and ``-o outputPDF``.

    :raises RuntimeError: when the two file lists differ in length
    """
    summaryPattern = simDir + "*_utrsummary.csv"
    tcountPattern = slamDir + "*_reads_slamdunk_mapped_filtered_tcount.csv"
    summaries = sorted(glob.glob(summaryPattern))
    tcounts = sorted(glob.glob(tcountPattern))

    if len(summaries) != len(tcounts):
        raise RuntimeError("Couldn't match files with timepoints")

    pieces = ["Rscript " + pathEvalConversionrates]
    pieces.append("-c " + str(conversionRate))
    pieces.append("-s " + ",".join(summaries))
    pieces.append("-f " + ",".join(tcounts))
    pieces.append("-o " + outputPDF)
    run(" ".join(pieces), sys.stderr, dry=False, verbose=False)
def bamSort(outputBAM, log, newHeader, verbose):
    """Sort ``outputBAM`` in place with ``samtools sort``, optionally replacing its header.

    :param outputBAM: BAM file that is sorted; the sorted result is written
        back to the same path
    :param log: file-like object forwarded to ``run``
    :param newHeader: header to use for the BAM; when ``None`` the file is
        only moved aside and sorted
    :param verbose: forwarded to ``run``
    """
    tmp = outputBAM + "_tmp"

    if newHeader is not None:
        # Re-write every read into a temp BAM carrying the new header.
        # The context managers close both handles even if the copy fails.
        with pysam.AlignmentFile(outputBAM, "rb") as source, \
                pysam.AlignmentFile(tmp, "wb", header=newHeader) as target:
            for read in source:
                target.write(read)
    else:
        os.rename(outputBAM, tmp)

    run(" ".join([getBinary("samtools"), "sort", "-o", outputBAM, tmp]), log, verbose=verbose, dry=False)
    removeFile(tmp)
def plotHalfLifes(bed, simDir, slamDir, timePointsStr, conversionRate, outputPDF):
    """Call the ``pathEvalHalfLifes`` R script for a set of time points.

    Simulated summaries (``*_utrsummary.csv`` under the ``simDir`` prefix) and
    count files (``*_reads_slamdunk_mapped_filtered_tcount.csv`` under the
    ``slamDir`` prefix) are collected in sorted order and passed to the script
    together with the BED file (``-b``), the conversion rate (``-c``), the
    time points from ``timePointsStr`` (``-t``) and the output PDF (``-o``).

    :raises RuntimeError: when file lists and time points differ in length
    """
    summaryPattern = simDir + "*_utrsummary.csv"
    tcountPattern = slamDir + "*_reads_slamdunk_mapped_filtered_tcount.csv"
    summaries = sorted(glob.glob(summaryPattern))
    tcounts = sorted(glob.glob(tcountPattern))
    timePoints = timePointsStr.split(",")

    lengthsAgree = len(summaries) == len(tcounts) and len(tcounts) == len(timePoints)
    if not lengthsAgree:
        raise RuntimeError("Couldn't match files with timepoints")

    pieces = ["Rscript " + pathEvalHalfLifes]
    pieces.append("-b " + bed)
    pieces.append("-c " + str(conversionRate))
    pieces.append("-s " + ",".join(summaries))
    pieces.append("-f " + ",".join(tcounts))
    pieces.append("-t " + ",".join(timePoints))
    pieces.append("-o " + outputPDF)
    run(" ".join(pieces), sys.stderr, dry=False, verbose=False)
def runSam2bam(inFile, outFile, log, index=True, sort=None, delinFile=False, onlyUnique=False, onlyProperPaired=False, filterMQ=0, L=None, threads=1, verbose=False, dry=False):
    """Convert ``inFile`` to BAM with ``samtools view``, optionally sort and index it.

    :param inFile: input alignment file
    :param outFile: BAM file to create
    :param log: file-like object receiving command output / messages
    :param index: index ``outFile`` with ``pysamIndex`` at the end
    :param sort: ``None`` (no sorting), ``"index"`` (plain ``samtools sort``)
        or ``"name"`` (``samtools sort -n``), case-insensitive. The legacy
        booleans are accepted too: ``True`` behaves like ``"index"``,
        ``False`` like ``None``.
    :param delinFile: delete ``inFile`` after conversion; also enables the
        "already converted" shortcut (output exists, input is gone)
    :param onlyUnique: if set and no ``filterMQ`` is given, ``-q 1`` is used
    :param onlyProperPaired: adds ``-f 2``
    :param filterMQ: if > 0, passed as ``-q``
    :param L: if not ``None``, passed as ``-L``
    :param threads: passed as ``-@``
    :param verbose: forwarded to ``run``
    :param dry: forwarded to ``run``; in a dry run no file is renamed,
        removed or indexed
    :raises ValueError: if ``sort`` is a string other than ``"index"``/``"name"``
    """
    if delinFile and files_exist(outFile) and not files_exist(inFile):
        print("Skipping sam2bam for %s" % outFile, file=log)
    else:
        # Resolve the sort mode BEFORE any file is touched: previously an
        # unknown mode moved the fresh BAM to a temp file and then deleted it.
        sortCommands = {"index": "samtools sort", "name": "samtools sort -n"}
        if isinstance(sort, bool):
            sortMode = "index" if sort else None
        elif sort is None:
            sortMode = None
        else:
            sortMode = sort.lower()
            if sortMode not in sortCommands:
                raise ValueError(
                    "Unsupported sort mode %r (expected 'index' or 'name')" % (sort,))

        if onlyUnique and filterMQ == 0:
            filterMQ = 1

        cmd = ["samtools view", "-@", str(threads), "-Sb", "-o", outFile, inFile]
        if filterMQ > 0:
            cmd += ["-q", str(filterMQ)]
        if onlyProperPaired:
            cmd += ["-f", "2"]
        if L is not None:
            cmd += ["-L", L]
        run(" ".join(cmd), log, verbose=verbose, dry=dry)

        if sortMode is not None:
            # samtools sort writes to outFile, so the unsorted data is moved aside first.
            tmp = outFile + "_tmp"
            if not dry:
                os.rename(outFile, tmp)
            run(" ".join([sortCommands[sortMode], "-@", str(threads), "-o", outFile, tmp]),
                log, verbose=verbose, dry=dry)
            # In a dry run the temp file was never created.
            if not dry:
                removeFile(tmp)

        if delinFile and not dry:
            removeFile(inFile)

    # NOTE(review): indexing a name-sorted BAM is expected to fail; callers
    # using sort="name" presumably pass index=False - confirm.
    # A dry run produces no output file, hence nothing to index.
    if index and not dry:
        pysamIndex(outFile)
def bamSort(outputBAM, log, newHeader, paired, verbose):
    """Sort ``outputBAM`` in place, optionally replacing its header first.

    :param outputBAM: BAM file that is sorted; the sorted result is written
        back to the same path
    :param log: file-like object forwarded to ``run``
    :param newHeader: header to use for the BAM; when ``None`` the file is
        only moved aside and sorted
    :param paired: when truthy the file is sorted with ``samtools sort -n``,
        otherwise with plain ``samtools sort``
    :param verbose: forwarded to ``run``
    """
    tmp = outputBAM + "_tmp"

    if newHeader is not None:
        # Re-write every read into a temp BAM carrying the new header.
        # The context managers close both handles even if the copy fails.
        with pysam.AlignmentFile(outputBAM, "rb") as source, \
                pysam.AlignmentFile(tmp, "wb", header=newHeader) as target:
            for read in source:
                target.write(read)
    else:
        os.rename(outputBAM, tmp)

    if not paired:
        run("samtools sort %s -o %s" % (tmp, outputBAM), log, verbose=verbose, dry=False)
    else:
        run("samtools sort -n %s -o %s" % (tmp, outputBAM), log, verbose=verbose)

    removeFile(tmp)
def addTcConversions(bed, readInFile, readOutFile, pulseTimePoint, chaseTimePoint, utrSummaryFile, conversionRate, librarySize, sampleInfo, labeledTranscripts=-1.0):
    """Add T>C conversions to simulated reads, UTR by UTR, and write a sorted BAM.

    Reads are taken from the FASTA file ``readInFile`` and grouped by the UTR
    name derived from each record id (``getUtrName``). Records of one UTR are
    expected to be consecutive in the file. For every group the conversion
    percentage is computed, ``addTcConversionsToReads`` writes the reads to a
    temporary SAM file, and one line is added to ``utrSummaryFile``. The SAM
    file is finally converted and sorted with samtools into ``readOutFile``.

    :param bed: BED file with the UTR annotation
    :param readInFile: FASTA file with the simulated reads
    :param readOutFile: resulting BAM file
    :param pulseTimePoint: forwarded to ``computeConversionRate``
    :param chaseTimePoint: forwarded to ``computeConversionRate``
    :param utrSummaryFile: path of the per-UTR summary that is written
    :param conversionRate: forwarded to ``addTcConversionsToReads``
    :param librarySize: denominator for the reads-per-million value
    :param sampleInfo: object providing ``Name``, ``ID``, ``Type`` and ``Time``
        for the summary header
    :param labeledTranscripts: forwarded to ``computeConversionRate``
    :raises KeyError: if a read refers to a UTR name missing from ``bed``
    """
    # Local import: the module-level import block is not visible from here.
    from itertools import groupby

    # Read utrs from BED file
    utrs = parseUtrBedFile(bed)
    bedMD5 = md5(bed)

    readOutTemp = readOutFile + "_tmp.sam"
    readOutTempBAM = readOutFile + "_tmp.bam"

    with open(readOutTemp, "w") as readOutSAM, \
            open(utrSummaryFile, "w") as utrSummary, \
            open(readInFile) as fastaHandle:
        print("@HD\tVN:1.0\tSO:unsorted", file=readOutSAM)

        print("#slamdunk v" + __version__, __count_version__, "sample info:", sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time, sep="\t", file=utrSummary)
        print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=utrSummary)
        print(SlamSeqInterval.Header, file=utrSummary)

        fasta_sequences = SeqIO.parse(fastaHandle, 'fasta')
        # groupby keeps every record of a run of equal UTR names together.
        # The former hand-written loop dropped the first read of each new UTR
        # and failed on an empty input file.
        for utrName, group in groupby(fasta_sequences, key=lambda entry: getUtrName(entry.id)):
            reads = list(group)
            utr = utrs[utrName]

            readsCPM = len(reads) * 1000000.0 / librarySize
            readToConvertPercent = computeConversionRate(utr.score, pulseTimePoint, chaseTimePoint, labeledTranscripts)
            readsWithTC, totalTCount, totalTcCount = addTcConversionsToReads(utr, reads, readToConvertPercent, conversionRate, readOutSAM)
            printUtrSummary(utr, len(reads), readsWithTC, totalTCount, totalTcCount, utrSummary, readsCPM, readToConvertPercent)

    # Convert to BAM
    run("samtools view -Sb " + readOutTemp + " > " + readOutTempBAM)
    # Sort with samtools (default sort order)
    run("samtools sort -o " + readOutFile + " " + readOutTempBAM)

    os.unlink(readOutTemp)
    os.unlink(readOutTempBAM)
def addTcConversions(bed, readInFile, readOutFile, pulseTimePoint, chaseTimePoint, utrSummaryFile, conversionRate, librarySize, sampleInfo, labeledTranscripts = -1.0):
    """Add T>C conversions to simulated reads, UTR by UTR, and write a sorted BAM.

    Records of the FASTA file ``readInFile`` are collected into batches that
    share the same UTR name (``getUtrName`` of the record id); a batch ends
    when the name changes, so reads of one UTR must be consecutive. Each batch
    is converted by ``addTcConversionsToReads`` into a temporary SAM file and
    summarised in ``utrSummaryFile``. Afterwards the SAM file is converted and
    sorted with samtools into ``readOutFile`` and the temp files are removed.

    :param bed: BED file with the UTR annotation
    :param readInFile: FASTA file with the simulated reads
    :param readOutFile: resulting BAM file
    :param pulseTimePoint: forwarded to ``computeConversionRate``
    :param chaseTimePoint: forwarded to ``computeConversionRate``
    :param utrSummaryFile: path of the per-UTR summary that is written
    :param conversionRate: forwarded to ``addTcConversionsToReads``
    :param librarySize: denominator for the reads-per-million value
    :param sampleInfo: object providing ``Name``, ``ID``, ``Type`` and ``Time``
        for the summary header
    :param labeledTranscripts: forwarded to ``computeConversionRate``
    :raises KeyError: if a read refers to a UTR name missing from ``bed``
    """
    # Read utrs from BED file
    utrs = parseUtrBedFile(bed)

    readOutTemp = readOutFile + "_tmp.sam"
    readOutTempBAM = readOutFile + "_tmp.bam"

    with open(readOutTemp, "w") as readOutSAM, \
            open(utrSummaryFile, "w") as utrSummary, \
            open(readInFile) as fastaHandle:

        def processUtr(name, batch):
            # Convert one UTR's reads and append its line to the summary.
            utr = utrs[name]
            readsCPM = len(batch) * 1000000.0 / librarySize
            readToConvertPercent = computeConversionRate(utr.score, pulseTimePoint, chaseTimePoint, labeledTranscripts)
            readsWithTC, totalTCount, totalTcCount = addTcConversionsToReads(utr, batch, readToConvertPercent, conversionRate, readOutSAM)
            printUtrSummary(utr, len(batch), readsWithTC, totalTCount, totalTcCount, utrSummary, readsCPM, readToConvertPercent)

        print("@HD\tVN:1.0\tSO:unsorted", file=readOutSAM)

        bedMD5 = md5(bed)
        print("#slamdunk v" + __version__, __count_version__, "sample info:", sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time, sep="\t", file=utrSummary)
        print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=utrSummary)
        print(SlamSeqInterval.Header, file=utrSummary)

        reads = []
        lastUtrName = None

        for entry in SeqIO.parse(fastaHandle, 'fasta'):
            utrName = getUtrName(entry.id)
            if reads and utrName != lastUtrName:
                processUtr(lastUtrName, reads)
                reads = []
            # The current record always belongs to the (possibly new) batch;
            # previously the record that triggered a flush was discarded.
            reads.append(entry)
            lastUtrName = utrName

        # Last UTR (skipped for an empty input file, which used to fail here)
        if reads:
            processUtr(lastUtrName, reads)

    # Convert to BAM
    run("samtools view -Sb " + readOutTemp + " > " + readOutTempBAM)
    # Sort with samtools (default sort order)
    run("samtools sort -o " + readOutFile + " " + readOutTempBAM)

    os.unlink(readOutTemp)
    os.unlink(readOutTempBAM)
def computeTconversionsAll(
        ref,
        snpsFile,
        bam,
        outputBedgraphPlus,
        outputBedgraphPlusNew,
        outputBedgraphMinus,
        outputBedgraphMinusNew,
        conversionThreshold,
        minQual,
        is_inverse,
        log,
):
    """Write per-base, per-strand coverage of read pairs (all / T>C-containing).

    The BAM file must be sorted by query name; if its header says otherwise a
    name-sorted copy is created with ``samtools sort -n`` (and reused if it
    already exists). Mates are paired up while iterating, the covered span of
    every pair is added to a per-chromosome array, and four files are written
    with one line ``chrom start end value`` per base, scaled to counts per
    million counted pairs.

    :param ref: reference handed to ``SlamSeqIter``
    :param snpsFile: SNP file loaded through ``SNPtools.SNPDictionary``
    :param bam: input BAM file
    :param outputBedgraphPlus: output for all pairs counted as forward
    :param outputBedgraphPlusNew: output for forward pairs with a T>C read
    :param outputBedgraphMinus: output for all pairs counted as reverse
    :param outputBedgraphMinusNew: output for reverse pairs with a T>C read
    :param conversionThreshold: forwarded to ``SlamSeqIter``
    :param minQual: forwarded to ``SlamSeqIter``
    :param is_inverse: take the strand from the second read of a pair instead
        of the first
    :param log: file-like object receiving the read statistics
    :raises KeyError: if a pair maps to a chromosome missing from the
        hard-coded size table
    """

    def to_bed_graph(c, data, bedgraph, rn):
        # Scales ``data`` in place to counts per million and writes one line per base.
        if rn:
            # Without counted pairs all values are zero already; dividing by
            # zero would only turn them into NaN.
            data /= rn
            data *= 1000000.0
        for i, d in enumerate(data):
            print(c, i, i + 1, d, file=bedgraph)

    # Hard-coded chromosome lengths.
    # NOTE(review): names/sizes look like a yeast assembly - confirm, or derive
    # them from the BAM header instead.
    chrom_sizes = [
        ('chrI', 230218),
        ('chrII', 813184),
        ('chrIII', 316620),
        ('chrIV', 1531933),
        ('chrIX', 439888),
        ('chrM', 85779),
        ('chrV', 576874),
        ('chrVI', 270161),
        ('chrVII', 1090940),
        ('chrVIII', 562643),
        ('chrX', 745751),
        ('chrXI', 666816),
        ('chrXII', 1078177),
        ('chrXIII', 924431),
        ('chrXIV', 784333),
        ('chrXV', 1091291),
        ('chrXVI', 948066),
    ]

    def empty_coverage():
        # One independent zero-filled float32 track per chromosome.
        return {name: np.zeros(size, dtype='float32') for name, size in chrom_sizes}

    chroms_fw = empty_coverage()
    chroms_bw = empty_coverage()
    chroms_fw_new = empty_coverage()
    chroms_bw_new = empty_coverage()

    readNumber, positiveCount, negativeCount, positiveCountNew, negativeCountNew = 0, 0, 0, 0, 0

    bamFile = pysam.AlignmentFile(bam, "rb")
    if bamFile.header['HD']['SO'] != 'queryname':
        # Sort bam file
        sbam = replaceExtension(bam, '.bam', '_sorted')
        if not os.path.exists(sbam):
            run('samtools sort -n %s -o %s' % (bam, sbam), log)
    else:
        sbam = bam
    # The first handle was only needed to inspect the header.
    bamFile.close()

    bamFile = pysam.AlignmentFile(sbam, "rb")

    snps = SNPtools.SNPDictionary(snpsFile)
    snps.read()

    # Go through one chr after the other
    seqIter = SlamSeqIter(bamFile, ref, snps, conversionThreshold, minQual)
    read1 = None
    read2 = None
    for read in seqIter:
        if not read.isPaired or read.unmappedMate or read.duplicate:
            continue
        if read.isSecondRead:
            read2 = read
        else:
            # A first read starts a new pair; wait for its mate.
            read1 = read
            read2 = None
            continue
        if read1 is None or read2 is None or read1.queryName != read2.queryName:
            continue

        readNumber += 1
        chrom = read1.chromosome
        # Span of the pair: leftmost start to rightmost end of BOTH mates
        # (the end used to be taken from read2 only).
        start = np.minimum(read1.startRefPos, read2.startRefPos)
        end = np.maximum(read1.endRefPos, read2.endRefPos)
        is_tc_read = read1.isTcRead or read2.isTcRead
        direction_read = read1 if not is_inverse else read2

        if direction_read.direction == ReadDirection.Forward:
            positiveCount += 1
            chroms_fw[chrom][start:end] += 1
            if is_tc_read:
                positiveCountNew += 1
                chroms_fw_new[chrom][start:end] += 1
        else:
            negativeCount += 1
            chroms_bw[chrom][start:end] += 1
            if is_tc_read:
                negativeCountNew += 1
                chroms_bw_new[chrom][start:end] += 1

    print("Total reads: %s\n"
          "Positive reads: %s\n"
          "Positive reads new: %s\n"
          "Negative reads: %s\n"
          "Negative reads new: %s" %
          (readNumber, positiveCount, positiveCountNew, negativeCount, negativeCountNew),
          file=log)

    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, \
            open(outputBedgraphPlusNew, 'w') as fileBedgraphPlusNew, \
            open(outputBedgraphMinus, 'w') as fileBedgraphMinus, \
            open(outputBedgraphMinusNew, 'w') as fileBedgraphMinusNew:
        for chrom in chroms_fw:
            to_bed_graph(chrom, chroms_fw[chrom], fileBedgraphPlus, readNumber)
            to_bed_graph(chrom, chroms_bw[chrom], fileBedgraphMinus, readNumber)
            to_bed_graph(chrom, chroms_fw_new[chrom], fileBedgraphPlusNew, readNumber)
            to_bed_graph(chrom, chroms_bw_new[chrom], fileBedgraphMinusNew, readNumber)