def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, referenceFastaFile, options):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    For every record of the SAM file one exonerate format cigar line is
    written to "temp.cigar" and one fasta record to "temp.fa" in the global
    temp dir of the target. The query name of each record is made unique by
    appending "_<index of the record in the file>". The function then adds
    cPecanEm.expectationMaximisationTrials as a child target and sets
    learnModelFromSamFileTargetFn2 as the follow on, which receives the path
    of the unnormalised model and the options.

    target: target object providing getGlobalTempDir, addChildTargetFn and
        setFollowOnTargetFn.
    samFile: path of the SAM file to read.
    readFastqFile: not used by this function.
    referenceFastaFile: path of the reference fasta, handed to the child
        joined with the generated reads fasta by a single space.
    options: passed unchanged to the child and the follow on.
    """
    globalTempDir = target.getGlobalTempDir()
    #Get cigars and reads fasta file
    cigars = os.path.join(globalTempDir, "temp.cigar")
    reads = os.path.join(globalTempDir, "temp.fa")
    with open(cigars, 'w') as fHCigars, open(reads, 'w') as fHReads:
        sam = pysam.Samfile(samFile, "r")
        try:
            #enumerate streams the records, zip(sam, xrange(...)) would build
            #the complete list of records in memory first
            for counter, aR in enumerate(sam):
                aR.query_name = aR.query_name + "_%s" % counter
                fHCigars.write(getExonerateCigarFormatString(aR, sam) + "\n")
                fastaWrite(fHReads, aR.query_name, aR.seq)
        finally:
            sam.close() #The sam handle was previously never closed

    unnormalisedOutputModel = os.path.join(globalTempDir, "unnormalisedOutputModel.hmm")
    target.addChildTargetFn(cPecanEm.expectationMaximisationTrials,
                            args=(" ".join([reads, referenceFastaFile]), cigars,
                                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, options))
def paralleliseSamProcessingTargetFn(target, samFile, referenceFastaFile, outputFile, childTargetFn, followOnTargetFn, options):
    """Parallelise a computation over the alignments in a SAM file.

    The alignments are split, in file order, into batches. A new batch is
    started when the reference sequence name changes or when the summed query
    sequence length of the current batch is greater than
    options.maxAlignmentLengthPerJob. Every batch is written to a cigar file
    (one exonerate format cigar per line) and a fasta file of the query
    sequences in the global temp dir of the target, and childTargetFn is added
    as a child target with the arguments
    (cigarFile, refName, refSequence, queryFastaFile, batchOutputFile, options).
    Finally followOnTargetFn is set as the follow on with the arguments
    (samFile, referenceFastaFile, outputFile, batchOutputFiles, options).
    """
    #Load reference sequences
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    globalTempDir = target.getGlobalTempDir()
    tempOutputFiles = []
    childCount, totalSeqLength = 0, 0
    tempExonerateFile, tempQueryFile = None, None
    tempExonerateFileHandle, tempQueryFileHandle = None, None
    refName = None

    def makeChild():
        #Close the current batch, if there is one, and add a child target to process it.
        if tempExonerateFile is None:
            return
        tempExonerateFileHandle.close()
        tempQueryFileHandle.close()
        #Temporary file to store the output of the child
        tempOutputFiles.append(os.path.join(globalTempDir, "tempOutput_%i.txt" % childCount))
        target.addChildTargetFn(childTargetFn,
                                args=(tempExonerateFile, refName, refSequences[refName],
                                      tempQueryFile, tempOutputFiles[-1], options))

    #Read through the SAM file
    sam = pysam.Samfile(samFile, "r")
    try:
        for aR in samIterator(sam): #Iterate on the sam lines realigning them in parallel
            currentRefName = sam.getrname(aR.reference_id)
            #The first alignment always opens a batch (tempExonerateFile is still None)
            if tempExonerateFile is None or \
               totalSeqLength > options.maxAlignmentLengthPerJob or \
               refName != currentRefName:
                makeChild()
                tempExonerateFile = os.path.join(globalTempDir, "tempExonerateCigar_%s.cig" % childCount)
                tempExonerateFileHandle = open(tempExonerateFile, 'w')
                tempQueryFile = os.path.join(globalTempDir, "tempQueryCigar_%s.fa" % childCount)
                tempQueryFileHandle = open(tempQueryFile, 'w')
                childCount += 1
                totalSeqLength = 0
            tempExonerateFileHandle.write(getExonerateCigarFormatString(aR, sam) + "\n")
            #This is the query sequence, including soft clipped bases, but excluding hard clip bases
            fastaWrite(tempQueryFileHandle, aR.query_name, aR.query_sequence)
            totalSeqLength += len(aR.query_sequence)
            refName = currentRefName
        makeChild() #Flush the last batch
    finally:
        #Finish up
        sam.close()

    target.setFollowOnTargetFn(followOnTargetFn,
                               args=(samFile, referenceFastaFile, outputFile,
                                     tempOutputFiles, options))
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, referenceSequence, querySequenceFile, outputCigarFile, options):
    """Realigns a batch of pairwise alignments to one reference sequence with cPecanRealign.

    The lines of exonerateCigarStringFile are paired, in order, with the
    records of the fasta file querySequenceFile. For every pair the query is
    written to a temporary fasta file and the cigar line is piped into
    cPecanRealign, whose output is appended to outputCigarFile. If the command
    raises an exception the failure is logged with target.logToMaster and the
    alignment is skipped.

    options: must provide hmmFile, gapGamma and matchGamma.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #The hmm argument is the same for every alignment
    loadHmm = nameValue("loadHmm", options.hmmFile)

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
                zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Strip only the line terminator, [:-1] would remove a real
            #character from a final line without a newline
            cigarString = exonerateCigarString.rstrip("\n")
            #Call to cPecanRealign
            command = "echo %s | cPecanRealign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
                (cigarString, tempRefFile, tempReadFile, loadHmm,
                 options.gapGamma, options.matchGamma, outputCigarFile)
            try:
                system(command)
            except Exception as e: #Best effort: report the alignment and carry on with the next
                target.logToMaster('Caught an exception! qname = "%s"\n' % querySequenceName)
                target.logToMaster('len(exonerateCigarString[:-1]) = %d\n' % len(cigarString))
                target.logToMaster('[bad] Command that caused the exception:\n')
                target.logToMaster(command)
                target.logToMaster('\n')
                target.logToMaster('\n')
                target.logToMaster(str(e) + '\n')
                target.logToMaster('\n')
def run(self):
    """Rescores every alignment of self.samFile with cactus_realign and reports the scores.

    For each SAM record the reference and the read are written to temporary
    fasta files and the exonerate format cigar of the record is piped into
    cactus_realign (rescoring the original alignment with the
    "blasr_hmm_0.txt" model). The score of the single rescored cigar and its
    number of aligned pairs are collected. The collected values are written to
    "alignmentUncertainty.xml" in self.outputDir and, if there was at least
    one alignment, match_hist.R is run to create "posterior_prob_hist.pdf".
    """
    AbstractAnalysis.run(self) #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences

    #Temporary files and trained hmm file, the same for every read
    localTempDir = self.getLocalTempDir()
    tempCigarFile = os.path.join(localTempDir, "rescoredCigar.cig")
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")
    tempPosteriorProbsFile = os.path.join(localTempDir, "probs.tsv")
    hmmFile = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")

    #The data we collect
    avgPosteriorMatchProbabilityInCigar = []
    alignedPairsInCigar = []

    sam = pysam.Samfile(self.samFile, "r")
    try:
        for aR in samIterator(sam): #Iterate on the sam lines
            #Exonerate format Cigar string
            cigarString = getExonerateCigarFormatString(aR, sam)

            #Write the temporary files.
            refName = sam.getrname(aR.rname)
            fastaWrite(tempRefFile, refName, refSequences[refName])
            fastaWrite(tempReadFile, aR.qname, aR.query)

            #Call to cactus_realign
            system("echo %s | cactus_realign %s %s --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputPosteriorProbs=%s --loadHmm=%s > %s" % \
                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile, tempCigarFile))

            #Load the cigar once (and close the file) and get the posterior prob
            with open(tempCigarFile) as cigarFileHandle:
                rescoredAlignments = list(cigarRead(cigarFileHandle))
            assert len(rescoredAlignments) == 1
            pA = rescoredAlignments[0]
            avgPosteriorMatchProbabilityInCigar.append(pA.score)

            #Calculate the number of aligned pairs in the cigar
            alignedPairsInCigar.append(sum([ op.length for op in pA.operationList if op.type == PairwiseAlignment.PAIRWISE_MATCH ]))
            assert alignedPairsInCigar[-1] == len([ readPos for readPos, refPos in aR.aligned_pairs if readPos is not None and refPos is not None ])
    finally:
        sam.close()

    #Write out the substitution info
    weightedMatchProbability = float(sum([ avgMatchProb*alignedPairs for avgMatchProb, alignedPairs in zip(avgPosteriorMatchProbabilityInCigar, alignedPairsInCigar) ]))
    node = ET.Element("alignmentUncertainty", {
        "averagePosteriorMatchProbabilityPerRead":str(self.formatRatio(sum(avgPosteriorMatchProbabilityInCigar), len(avgPosteriorMatchProbabilityInCigar))),
        "averagePosteriorMatchProbability":str(self.formatRatio(weightedMatchProbability, sum(alignedPairsInCigar))),
        "averagePosteriorMatchProbabilitesPerRead":",".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]),
        "alignedPairsInCigar":",".join([ str(i) for i in alignedPairsInCigar ]) })
    with open(os.path.join(self.outputDir, "alignmentUncertainty.xml"), "w") as xmlFileHandle:
        xmlFileHandle.write(prettyXml(node))

    if len(avgPosteriorMatchProbabilityInCigar) > 0:
        #One tab separated line of per read scores, the input of match_hist.R
        uncertaintyFile = os.path.join(localTempDir, "tmp_uncertainty")
        with open(uncertaintyFile, "w") as outf:
            outf.write("\t".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]))
            outf.write("\n")
        system("Rscript nanopore/analyses/match_hist.R {} {}".format(uncertaintyFile, os.path.join(self.outputDir, "posterior_prob_hist.pdf")))

    #Indicate everything is all done
    self.finish()
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile, referenceSequenceName, referenceSequence, querySequenceFile, outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads.

    The lines of exonerateCigarStringFile are paired, in order, with the
    records of the fasta file querySequenceFile and each pair is run through
    cPecanRealign. The resulting (reference position, query position,
    posterior probability) lines are collated into a dictionary that maps
    (referenceSequenceName, reference position) to a dictionary from each base
    in BASES to its summed weight. The weight of an aligned pair is 1.0 if
    options.noMargin is set, else its posterior probability. Query bases not
    in BASES are ignored. The dictionary is pickled to outputPosteriorProbsFile.

    options: must provide noMargin and, if noMargin is false, alignmentModel.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")
    tempPosteriorProbsFile = os.path.join(localTempDir, "posteriorProbs.txt")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
                zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Strip only the line terminator, [:-1] would remove a real
            #character from a final line without a newline
            cigarString = exonerateCigarString.rstrip("\n")
            #Call to cPecanRealign
            if options.noMargin:
                #When we don't marginialize we just run cPecanRealign to get the list of aligned pairs
                #This runtime should be very fast
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 --splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" % \
                       (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile))
            else:
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" % \
                       (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, options.alignmentModel))

            #Now collate the reference position expectations
            with open(tempPosteriorProbsFile, 'r') as posteriorProbsFileHandle:
                for line in posteriorProbsFileHandle:
                    refPosition, queryPosition, posteriorProb = map(float, line.split())
                    assert posteriorProb <= 1.01
                    assert posteriorProb >= 0.0
                    key = (referenceSequenceName, int(refPosition))
                    if key not in expectationsOfBasesAtEachPosition:
                        expectationsOfBasesAtEachPosition[key] = dict(zip(BASES, [0.0]*len(BASES)))
                    queryBase = querySequence[int(queryPosition)].upper()
                    if queryBase in BASES: #Could be an N or other wildcard character, which we ignore
                        expectationsOfBasesAtEachPosition[key][queryBase] += 1.0 if options.noMargin else posteriorProb

    #Pickle the posterior probs, HIGHEST_PROTOCOL is a binary format so the
    #file has to be opened in binary mode
    with open(outputPosteriorProbsFile, 'wb') as fileHandle:
        cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
def paralleliseSamProcessingTargetFn(target, samFile, referenceFastaFile, outputFile, childTargetFn, followOnTargetFn, options):
    """Parallelise a computation over the alignments in a SAM file.

    The alignments are split, in file order, into batches. A new batch is
    started when the reference sequence name changes or when the summed query
    sequence length of the current batch is greater than
    options.maxAlignmentLengthPerJob. Every batch is written to a cigar file
    (one exonerate format cigar per line) and a fasta file of the query
    sequences in the global temp dir of the target, and childTargetFn is added
    as a child target with the arguments
    (cigarFile, refName, refSequence, queryFastaFile, batchOutputFile, options).
    Finally followOnTargetFn is set as the follow on with the arguments
    (samFile, referenceFastaFile, outputFile, batchOutputFiles, options).
    """
    #Load reference sequences
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    globalTempDir = target.getGlobalTempDir()
    tempOutputFiles = []
    childCount, totalSeqLength = 0, 0
    tempExonerateFile, tempQueryFile = None, None
    tempExonerateFileHandle, tempQueryFileHandle = None, None
    refName = None

    def makeChild():
        #Close the current batch, if there is one, and add a child target to process it.
        if tempExonerateFile is None:
            return
        tempExonerateFileHandle.close()
        tempQueryFileHandle.close()
        #Temporary file to store the output of the child
        tempOutputFiles.append(os.path.join(globalTempDir, "tempOutput_%i.txt" % childCount))
        target.addChildTargetFn(childTargetFn,
                                args=(tempExonerateFile, refName, refSequences[refName],
                                      tempQueryFile, tempOutputFiles[-1], options))

    #Read through the SAM file
    sam = pysam.Samfile(samFile, "r")
    try:
        for aR in samIterator(sam): #Iterate on the sam lines realigning them in parallel
            currentRefName = sam.getrname(aR.reference_id)
            #The first alignment always opens a batch (tempExonerateFile is still None)
            if tempExonerateFile is None or \
               totalSeqLength > options.maxAlignmentLengthPerJob or \
               refName != currentRefName:
                makeChild()
                tempExonerateFile = os.path.join(globalTempDir, "tempExonerateCigar_%s.cig" % childCount)
                tempExonerateFileHandle = open(tempExonerateFile, 'w')
                tempQueryFile = os.path.join(globalTempDir, "tempQueryCigar_%s.fa" % childCount)
                tempQueryFileHandle = open(tempQueryFile, 'w')
                childCount += 1
                totalSeqLength = 0
            tempExonerateFileHandle.write(getExonerateCigarFormatString(aR, sam) + "\n")
            #This is the query sequence, including soft clipped bases, but excluding hard clip bases
            fastaWrite(tempQueryFileHandle, aR.query_name, aR.query_sequence)
            totalSeqLength += len(aR.query_sequence)
            refName = currentRefName
        makeChild() #Flush the last batch
    finally:
        #Finish up
        sam.close()

    target.setFollowOnTargetFn(followOnTargetFn,
                               args=(samFile, referenceFastaFile, outputFile,
                                     tempOutputFiles, options))
def makeFastaSequenceNamesUnique(inputFastaFile, outputFastaFile):
    """Makes a fasta file with unique names

    Copies the records of inputFastaFile to outputFastaFile in order. A name
    that was already written gets the letter "i" appended until it is unique,
    and every collision is reported with logger.critical.

    Returns outputFastaFile.
    """
    names = set()
    #Both files are closed on exit, also if reading or writing fails
    with open(inputFastaFile, 'r') as inputFileHandle, \
         open(outputFastaFile, 'w') as outputFileHandle:
        for name, seq in fastaRead(inputFileHandle):
            while name in names:
                logger.critical("Got a duplicate fasta sequence name: %s" % name)
                name += "i"
            names.add(name)
            fastaWrite(outputFileHandle, name, seq)
    return outputFastaFile
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Realigns one pairwise alignment with cactus_realign.

    The reference and the query are written to "ref.fa" and "read.fa" in the
    global temp dir of the target, exonerateCigarString is piped into
    cactus_realign and the output of the command overwrites outputCigarFile.
    Asserts that outputCigarFile then contains exactly one cigar.
    """
    #Temporary files
    globalTempDir = target.getGlobalTempDir()
    tempRefFile = os.path.join(globalTempDir, "ref.fa")
    tempReadFile = os.path.join(globalTempDir, "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" % \
           (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))

    #Parse the output once, and close it, to check there is exactly one cigar
    with open(outputCigarFile) as cigarFileHandle:
        cigarCount = sum(1 for pA in cigarRead(cigarFileHandle))
    assert cigarCount == 1
def posteriorProbabilityCalculationTargetFn(target, exonerateCigarStringFile, referenceSequenceName, referenceSequence, querySequenceFile, outputPosteriorProbsFile, options):
    """Calculates the posterior probabilities of matches in a set of pairwise
    alignments between a reference sequence and a set of reads.

    The lines of exonerateCigarStringFile are paired, in order, with the
    records of the fasta file querySequenceFile and each pair is run through
    cPecanRealign. The resulting (reference position, query position,
    posterior probability) lines are collated into a dictionary that maps
    (referenceSequenceName, reference position) to a dictionary from each base
    in BASES to its summed weight. The weight of an aligned pair is 1.0 if
    options.noMargin is set, else its posterior probability. Query bases not
    in BASES are ignored. The dictionary is pickled to outputPosteriorProbsFile.

    options: must provide noMargin and, if noMargin is false, alignmentModel.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")
    tempPosteriorProbsFile = os.path.join(localTempDir, "posteriorProbs.txt")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #Hash to store posterior probabilities in
    expectationsOfBasesAtEachPosition = {}

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
                zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Strip only the line terminator, [:-1] would remove a real
            #character from a final line without a newline
            cigarString = exonerateCigarString.rstrip("\n")
            #Call to cPecanRealign
            if options.noMargin:
                #When we don't marginialize we just run cPecanRealign to get the list of aligned pairs
                #This runtime should be very fast
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=0 --splitMatrixBiggerThanThis=1 --rescoreOriginalAlignment --outputPosteriorProbs=%s" % \
                       (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile))
            else:
                system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s" % \
                       (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, options.alignmentModel))

            #Now collate the reference position expectations
            with open(tempPosteriorProbsFile, 'r') as posteriorProbsFileHandle:
                for line in posteriorProbsFileHandle:
                    refPosition, queryPosition, posteriorProb = map(float, line.split())
                    assert posteriorProb <= 1.01
                    assert posteriorProb >= 0.0
                    key = (referenceSequenceName, int(refPosition))
                    if key not in expectationsOfBasesAtEachPosition:
                        expectationsOfBasesAtEachPosition[key] = dict(zip(BASES, [0.0]*len(BASES)))
                    queryBase = querySequence[int(queryPosition)].upper()
                    if queryBase in BASES: #Could be an N or other wildcard character, which we ignore
                        expectationsOfBasesAtEachPosition[key][queryBase] += 1.0 if options.noMargin else posteriorProb

    #Pickle the posterior probs, HIGHEST_PROTOCOL is a binary format so the
    #file has to be opened in binary mode
    with open(outputPosteriorProbsFile, 'wb') as fileHandle:
        cPickle.dump(expectationsOfBasesAtEachPosition, fileHandle, cPickle.HIGHEST_PROTOCOL)
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Realigns one pairwise alignment with cactus_realign.

    The reference and the query are written to "ref.fa" and "read.fa" in the
    global temp dir of the target, exonerateCigarString is piped into
    cactus_realign and the output of the command overwrites outputCigarFile.
    Asserts that outputCigarFile then contains exactly one cigar.
    """
    #Temporary files
    globalTempDir = target.getGlobalTempDir()
    tempRefFile = os.path.join(globalTempDir, "ref.fa")
    tempReadFile = os.path.join(globalTempDir, "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" % \
           (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))

    #Parse the output once, and close it, to check there is exactly one cigar
    with open(outputCigarFile) as cigarFileHandle:
        cigarCount = sum(1 for pA in cigarRead(cigarFileHandle))
    assert cigarCount == 1
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, referenceSequence, querySequenceFile, outputCigarFile, options):
    """Realigns a batch of pairwise alignments to one reference sequence with cPecanRealign.

    The lines of exonerateCigarStringFile are paired, in order, with the
    records of the fasta file querySequenceFile. For every pair the query is
    written to a temporary fasta file and the cigar line, in double quotes, is
    piped into cPecanRealign, whose output is appended to outputCigarFile.

    options: must provide hmmFile, gapGamma and matchGamma.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #The hmm argument is the same for every alignment
    loadHmm = nameValue("loadHmm", options.hmmFile)

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
                zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Strip only the line terminator, [:-1] would remove a real
            #character from a final line without a newline
            cigarString = exonerateCigarString.rstrip("\n")
            #Call to cPecanRealign
            system("echo \"%s\" | cPecanRealign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
                   (cigarString, tempRefFile, tempReadFile, loadHmm,
                    options.gapGamma, options.matchGamma, outputCigarFile))
def realignCigarTargetFn(target, exonerateCigarStringFile, referenceSequenceName, referenceSequence, querySequenceFile, outputCigarFile, options):
    """Realigns a batch of pairwise alignments to one reference sequence with cPecanRealign.

    The lines of exonerateCigarStringFile are paired, in order, with the
    records of the fasta file querySequenceFile. For every pair the query is
    written to a temporary fasta file and the cigar line is piped into
    cPecanRealign, whose output is appended to outputCigarFile.

    options: must provide hmmFile, gapGamma and matchGamma.
    """
    #Temporary files
    localTempDir = target.getLocalTempDir()
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")

    #Write the temporary reference file.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)

    #The hmm argument is the same for every alignment
    loadHmm = nameValue("loadHmm", options.hmmFile)

    #For each cigar string
    with open(exonerateCigarStringFile, "r") as cigarFileHandle:
        for exonerateCigarString, (querySequenceName, querySequence) in \
                zip(cigarFileHandle, fastaRead(querySequenceFile)):
            fastaWrite(tempReadFile, querySequenceName, querySequence)
            #Strip only the line terminator, [:-1] would remove a real
            #character from a final line without a newline
            cigarString = exonerateCigarString.rstrip("\n")
            #Call to cPecanRealign
            system("echo %s | cPecanRealign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s >> %s" % \
                   (cigarString, tempRefFile, tempReadFile, loadHmm,
                    options.gapGamma, options.matchGamma, outputCigarFile))
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile, referenceFastaFile, outputModel):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Writes every read and its reverse complement (named "<name>_reverse") to
    "temp.fa" and one exonerate format cigar per SAM record to "temp.cigar",
    both in the global temp dir of the target. Every SAM record is asserted to
    be a global alignment of the complete read to the complete reference.
    Unless the file outputModel + "_unnormalised" already exists,
    cactus_expectationMaximisation.expectationMaximisationTrials is added as a
    child target to train it. learnModelFromSamFileTargetFn2 is set as the
    follow on with the unnormalised model path and outputModel.
    """
    refSequences = getFastaDictionary(referenceFastaFile) #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile) #Hash of names to sequences
    globalTempDir = target.getGlobalTempDir()

    #Convert the read file to fasta
    reads = os.path.join(globalTempDir, "temp.fa")
    with open(reads, 'w') as fH:
        for name in readSequences:
            seq = readSequences[name]
            fastaWrite(fH, name, seq)
            fastaWrite(fH, name + "_reverse", reverseComplement(seq))

    #Get cigars file
    cigars = os.path.join(globalTempDir, "temp.cigar")
    with open(cigars, 'w') as fH:
        sam = pysam.Samfile(samFile, "r")
        try:
            for aR in sam: #Iterate on the sam lines realigning them in parallel
                #Because these are global alignments with reverse complement coordinates reversed the following should all be true
                assert aR.pos == 0
                assert aR.qstart == 0
                readSeq = readSequences[aR.qname] #Looked up before the name is changed below
                assert aR.qend == len(readSeq)
                assert aR.aend == len(refSequences[sam.getrname(aR.rname)])
                assert len(aR.query) == len(readSeq)
                if aR.is_reverse: #Deal with reverse complements
                    assert aR.query.upper() == reverseComplement(readSeq).upper()
                    aR.qname += "_reverse"
                else:
                    assert aR.query.upper() == readSeq.upper()
                #Exonerate format Cigar string, using global coordinates
                fH.write(getExonerateCigarFormatString(aR, sam) + "\n")
        finally:
            sam.close() #The sam handle was previously never closed

    #Run cactus_expectationMaximisation
    options = cactus_expectationMaximisation.Options()
    options.modelType = "fiveStateAsymmetric" #"threeStateAsymmetric"
    options.optionsToRealign = "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob = 700000
    options.maxAlignmentLengthToSample = 50000000
    options.outputXMLModelFile = outputModel + ".xml"
    options.trainEmissions = True

    unnormalisedOutputModel = outputModel + "_unnormalised"
    #Do training if necessary
    if not os.path.exists(unnormalisedOutputModel):
        target.addChildTargetFn(cactus_expectationMaximisation.expectationMaximisationTrials,
                                args=(" ".join([reads, referenceFastaFile]), cigars,
                                      unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, outputModel))
#run: compares SNP calling strategies over sub-sampled read sets and writes the result as XML.
#For every hmm type ("cactus", "trained_0", "trained_20", "trained_40"), every coverage
#(1000000, 120, 60, 30, 10) and every replicate (3 per coverage, 1 for coverage 1000000) it:
# - loads the held out snps from self.referenceFastaFile + "_Index.txt" if that file exists,
#   a snp is a position where a sequence differs from its "<name>_mutated" counterpart;
# - shuffles the sam records and takes reads until the summed read length divided by the
#   summed reference length reaches the coverage;
# - for each taken read counts the aligned read bases per reference position and pipes the
#   exonerate format cigar into cactus_realign (with the blasr_hmm_0/20/40 model for the
#   "trained_*" types, without --loadHmm otherwise), summing the reported posterior
#   probabilities per reference position and read base;
# - scores four call sets (flat or hmm error substitution matrix, applied to the posterior
#   expectations or to the aligned base frequencies) against the held out snps, picks the
#   probability threshold (in steps of 0.01) with the maximal f-score and adds one sub element
#   "<call set name>_<hmmType>" per call set to the "marginAlignComparison" node.
#The node is written to "marginaliseConsensus.xml" in self.outputDir.
#Python 2 only (xrange, builtin reduce, list returning map). The name `bases` is not defined in
#this block and has to come from the enclosing module.
#NOTE(review): snpCalls.falseNegatives is only appended to inside the triple quoted string
#literal, which is never executed, so the "falseNegative_*" and "combinedFalseNegative_*"
#elements are never produced by the code as shown.
#NOTE(review): the local variables truePositives and falsePositives assigned after the f-score
#maximisation are not read afterwards.
#NOTE(review): the line breaks and indentation of this block are missing in this copy, so the
#nesting of the statements (e.g. which `if` owns `else: snpCalls.notCalled += 1`) has to be
#confirmed against the original file before the code is changed.
def run(self): AbstractAnalysis.run(self) #Call base method to do some logging refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences readSequences = getFastqDictionary(self.readFastqFile) #Hash of names to sequences node = ET.Element("marginAlignComparison") for hmmType in ("cactus", "trained_0", "trained_20", "trained_40"): for coverage in (1000000, 120, 60, 30, 10): for replicate in xrange(3 if coverage < 1000000 else 1): #Do replicates, unless coverage is all sam = pysam.Samfile(self.samFile, "r" ) #Trained hmm file to use.q hmmFile0 = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt") hmmFile20 = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_20.txt") hmmFile40 = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_40.txt") #Get substitution matrices nullSubstitionMatrix = getNullSubstitutionMatrix() flatSubstitutionMatrix = getJukesCantorTypeSubstitutionMatrix() hmmErrorSubstitutionMatrix = loadHmmErrorSubstitutionMatrix(hmmFile20) #Load the held out snps snpSet = {} referenceAlignmentFile = self.referenceFastaFile + "_Index.txt" if os.path.exists(referenceAlignmentFile): seqsAndMutatedSeqs = getFastaDictionary(referenceAlignmentFile) count = 0 for name in seqsAndMutatedSeqs: if name in refSequences: count += 1 trueSeq = seqsAndMutatedSeqs[name] mutatedSeq = seqsAndMutatedSeqs[name + "_mutated"] assert mutatedSeq == refSequences[name] for i in xrange(len(trueSeq)): if trueSeq[i] != mutatedSeq[i]: snpSet[(name, i)] = trueSeq[i] else: assert name.split("_")[-1] == "mutated" assert count == len(refSequences.keys()) #The data we collect expectationsOfBasesAtEachPosition = {} frequenciesOfAlignedBasesAtEachPosition = {} totalSampledReads = 0 totalAlignedPairs = 0 totalReadLength = 0 totalReferenceLength = sum(map(len, refSequences.values())) #Get a randomised ordering for the reads reads = [ aR for aR in samIterator(sam) ] random.shuffle(reads) for aR in reads: 
#Iterate on the sam lines if totalReadLength/totalReferenceLength >= coverage: #Stop when coverage exceeds the quota break totalReadLength += len(readSequences[aR.qname]) totalSampledReads += 1 #Temporary files tempCigarFile = os.path.join(self.getLocalTempDir(), "rescoredCigar.cig") tempRefFile = os.path.join(self.getLocalTempDir(), "ref.fa") tempReadFile = os.path.join(self.getLocalTempDir(), "read.fa") tempPosteriorProbsFile = os.path.join(self.getLocalTempDir(), "probs.tsv") #Ref name refSeqName = sam.getrname(aR.rname) #Sequences refSeq = refSequences[sam.getrname(aR.rname)] #Walk through the aligned pairs to collate the bases of aligned positions for aP in AlignedPair.iterator(aR, refSeq, readSequences[aR.qname]): totalAlignedPairs += 1 #Record an aligned pair key = (refSeqName, aP.refPos) if key not in frequenciesOfAlignedBasesAtEachPosition: frequenciesOfAlignedBasesAtEachPosition[key] = dict(zip(bases, [0.0]*len(bases))) readBase = aP.getReadBase() #readSeq[aP.readPos].upper() #Use the absolute read, ins if readBase in bases: frequenciesOfAlignedBasesAtEachPosition[key][readBase] += 1 #Write the temporary files. readSeq = aR.query #This excludes bases that were soft-clipped and is always of positive strand coordinates fastaWrite(tempRefFile, refSeqName, refSeq) fastaWrite(tempReadFile, aR.qname, readSeq) #Exonerate format Cigar string, which is in readSeq coordinates (positive strand). 
assert aR.pos == 0 assert aR.qstart == 0 assert aR.qend == len(readSeq) assert aR.aend == len(refSeq) cigarString = getExonerateCigarFormatString(aR, sam) #Call to cactus_realign if hmmType == "trained_0": system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \ (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile0, tempCigarFile)) elif hmmType == "trained_20": system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \ (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile20, tempCigarFile)) elif hmmType == "trained_40": system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s --loadHmm=%s > %s" % \ (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile40, tempCigarFile)) else: system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s > %s" % \ (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, tempCigarFile)) #Now collate the reference position expectations for refPosition, readPosition, posteriorProb in map(lambda x : map(float, x.split()), open(tempPosteriorProbsFile, 'r')): key = (refSeqName, int(refPosition)) if key not in expectationsOfBasesAtEachPosition: expectationsOfBasesAtEachPosition[key] = dict(zip(bases, [0.0]*len(bases))) readBase = readSeq[int(readPosition)].upper() if readBase in bases: expectationsOfBasesAtEachPosition[key][readBase] += posteriorProb #Collate aligned positions from cigars sam.close() totalHeldOut = len(snpSet) totalNotHeldOut = totalReferenceLength - totalHeldOut class SnpCalls: def __init__(self): self.falsePositives = [] self.truePositives = [] self.falseNegatives = [] self.notCalled = 0 @staticmethod def bucket(calls): calls = calls[:] calls.sort() buckets = 
[0.0]*101 for prob in calls: #Discretize buckets[int(round(prob*100))] += 1 for i in xrange(len(buckets)-2, -1, -1): #Make cumulative buckets[i] += buckets[i+1] return buckets def getPrecisionByProbability(self): tPs = self.bucket(map(lambda x : x[0], self.truePositives)) fPs = self.bucket(map(lambda x : x[0], self.falsePositives)) return map(lambda i : float(tPs[i]) / (tPs[i] + fPs[i]) if tPs[i] + fPs[i] != 0 else 0, xrange(len(tPs))) def getRecallByProbability(self): return map(lambda i : i/totalHeldOut if totalHeldOut != 0 else 0, self.bucket(map(lambda x : x[0], self.truePositives))) def getTruePositiveLocations(self): return map(lambda x : x[1], self.truePositives) def getFalsePositiveLocations(self): return map(lambda x : x[1], self.falsePositives) def getFalseNegativeLocations(self): return map(lambda x : x[0], self.falseNegatives) #The different call sets marginAlignMaxExpectedSnpCalls = SnpCalls() marginAlignMaxLikelihoodSnpCalls = SnpCalls() maxFrequencySnpCalls = SnpCalls() maximumLikelihoodSnpCalls = SnpCalls() #Now calculate the calls for refSeqName in refSequences: refSeq = refSequences[refSeqName] for refPosition in xrange(len(refSeq)): mutatedRefBase = refSeq[refPosition].upper() trueRefBase = (mutatedRefBase if not (refSeqName, refPosition) in snpSet else snpSet[(refSeqName, refPosition)]).upper() key = (refSeqName, refPosition) #Get base calls for errorSubstitutionMatrix, evolutionarySubstitutionMatrix, baseExpectations, snpCalls in \ ((flatSubstitutionMatrix, nullSubstitionMatrix, expectationsOfBasesAtEachPosition, marginAlignMaxExpectedSnpCalls), (hmmErrorSubstitutionMatrix, nullSubstitionMatrix, expectationsOfBasesAtEachPosition, marginAlignMaxLikelihoodSnpCalls), (flatSubstitutionMatrix, nullSubstitionMatrix, frequenciesOfAlignedBasesAtEachPosition, maxFrequencySnpCalls), (hmmErrorSubstitutionMatrix, nullSubstitionMatrix, frequenciesOfAlignedBasesAtEachPosition, maximumLikelihoodSnpCalls)): if key in baseExpectations: #Get posterior 
likelihoods expectations = baseExpectations[key] totalExpectation = sum(expectations.values()) if totalExpectation > 0.0: #expectationCallingThreshold: posteriorProbs = calcBasePosteriorProbs(dict(zip(bases, map(lambda x : float(expectations[x])/totalExpectation, bases))), mutatedRefBase, evolutionarySubstitutionMatrix, errorSubstitutionMatrix) probs = [ posteriorProbs[base] for base in "ACGT" ] #posteriorProbs.pop(mutatedRefBase) #Remove the ref base. #maxPosteriorProb = max(posteriorProbs.values()) #chosenBase = random.choice([ base for base in posteriorProbs if posteriorProbs[base] == maxPosteriorProb ]).upper() #Very naive way to call the base for chosenBase in "ACGT": if chosenBase != mutatedRefBase: maxPosteriorProb = posteriorProbs[chosenBase] if trueRefBase != mutatedRefBase and trueRefBase == chosenBase: snpCalls.truePositives.append((maxPosteriorProb, refPosition)) #True positive else: snpCalls.falsePositives.append((maxPosteriorProb, refPosition)) #False positive """ snpCalls.falseNegatives.append((refPosition, trueRefBase, mutatedRefBase, probs)) #False negative if trueRefBase != mutatedRefBase: if trueRefBase == chosenBase: snpCalls.truePositives.append((maxPosteriorProb, refPosition)) #True positive else: snpCalls.falseNegatives.append((refPosition, trueRefBase, mutatedRefBase, probs)) #False negative else: snpCalls.falsePositives.append((maxPosteriorProb, refPosition)) #False positive """ else: snpCalls.notCalled += 1 #Now find max-fscore point for snpCalls, tagName in ((marginAlignMaxExpectedSnpCalls, "marginAlignMaxExpectedSnpCalls"), (marginAlignMaxLikelihoodSnpCalls, "marginAlignMaxLikelihoodSnpCalls"), (maxFrequencySnpCalls, "maxFrequencySnpCalls"), (maximumLikelihoodSnpCalls, "maximumLikelihoodSnpCalls")): recall = snpCalls.getRecallByProbability() precision = snpCalls.getPrecisionByProbability() assert len(recall) == len(precision) fScore, pIndex = max(map(lambda i : (2 * recall[i] * precision[i] / (recall[i] + precision[i]) if recall[i] + 
precision[i] > 0 else 0.0, i), range(len(recall)))) truePositives = snpCalls.getRecallByProbability()[pIndex] falsePositives = snpCalls.getPrecisionByProbability()[pIndex] optimumProbThreshold = float(pIndex)/100.0 #Write out the substitution info node2 = ET.SubElement(node, tagName + "_" + hmmType, { "coverage":str(coverage), "actualCoverage":str(float(totalAlignedPairs)/totalReferenceLength), "totalAlignedPairs":str(totalAlignedPairs), "totalReferenceLength":str(totalReferenceLength), "replicate":str(replicate), "totalReads":str(len(reads)), "avgSampledReadLength":str(float(totalReadLength)/totalSampledReads), "totalSampledReads":str(totalSampledReads), "totalHeldOut":str(totalHeldOut), "totalNonHeldOut":str(totalNotHeldOut), "recall":str(recall[pIndex]), "precision":str(precision[pIndex]), "fScore":str(fScore), "optimumProbThreshold":str(optimumProbThreshold), "totalNoCalls":str(snpCalls.notCalled), "recallByProbability":" ".join(map(str, snpCalls.getRecallByProbability())), "precisionByProbability":" ".join(map(str, snpCalls.getPrecisionByProbability())) }) #"falsePositiveLocations":" ".join(map(str, snpCalls.getFalsePositiveLocations())), #"falseNegativeLocations":" ".join(map(str, snpCalls.getFalseNegativeLocations())), #"truePositiveLocations":" ".join(map(str, snpCalls.getTruePositiveLocations())) }) for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives: ET.SubElement(node2, "falseNegative_%s_%s" % (trueRefBase, mutatedRefBase), { "posteriorProbs":" ".join(map(str, posteriorProbs))}) for falseNegativeBase in bases: for mutatedBase in bases: posteriorProbsArray = [ posteriorProbs for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives if (trueRefBase.upper() == falseNegativeBase.upper() and mutatedBase.upper() == mutatedRefBase.upper() ) ] if len(posteriorProbsArray) > 0: summedProbs = reduce(lambda x, y : map(lambda i : x[i] + y[i], xrange(len(x))), posteriorProbsArray) summedProbs = 
map(lambda x : float(x)/sum(summedProbs), summedProbs) ET.SubElement(node2, "combinedFalseNegative_%s_%s" % (falseNegativeBase, mutatedBase), { "posteriorProbs":" ".join(map(str, summedProbs))}) open(os.path.join(self.outputDir, "marginaliseConsensus.xml"), "w").write(prettyXml(node)) #Indicate everything is all done self.finish()
def run(self):
    """Benchmark SNP calling from realigned reads and write the results as XML.

    For every combination of realignment model ("cactus" default or one of
    the trained blasr HMMs), target coverage and replicate, the reads of
    self.samFile are shuffled and sampled until the coverage quota is met.
    Each sampled read is realigned with cactus_realign, and two per-position
    tables are accumulated: the posterior-weighted base expectations from the
    realignment and the raw frequencies of the aligned bases. Four calling
    strategies (two substitution matrices x two tables) are then scored
    against the held out SNPs, and the precision/recall/F-score statistics
    are attached to one XML element per strategy.

    The XML document is written to "marginaliseConsensus.xml" in
    self.outputDir and self.finish() is called at the end.
    """
    AbstractAnalysis.run(self)  #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile)  #Hash of names to sequences
    readSequences = getFastqDictionary(self.readFastqFile)  #Hash of names to sequences
    totalReferenceLength = sum(map(len, refSequences.values()))

    #Trained hmm files, keyed by the hmmType that uses them ("cactus" has none)
    baseDir = pathToBaseNanoporeDir()
    trainedHmmFiles = {
        "trained_0": os.path.join(baseDir, "nanopore", "mappers", "blasr_hmm_0.txt"),
        "trained_20": os.path.join(baseDir, "nanopore", "mappers", "blasr_hmm_20.txt"),
        "trained_40": os.path.join(baseDir, "nanopore", "mappers", "blasr_hmm_40.txt"),
    }

    #Load the held out snps: (reference name, position) -> true base.
    #This only depends on the reference files, so it is done once.
    snpSet = {}
    referenceAlignmentFile = self.referenceFastaFile + "_Index.txt"
    if os.path.exists(referenceAlignmentFile):
        seqsAndMutatedSeqs = getFastaDictionary(referenceAlignmentFile)
        count = 0
        for name in seqsAndMutatedSeqs:
            if name in refSequences:
                count += 1
                trueSeq = seqsAndMutatedSeqs[name]
                mutatedSeq = seqsAndMutatedSeqs[name + "_mutated"]
                assert mutatedSeq == refSequences[name]
                for i in xrange(len(trueSeq)):
                    if trueSeq[i] != mutatedSeq[i]:
                        snpSet[(name, i)] = trueSeq[i]
            else:
                assert name.split("_")[-1] == "mutated"
        assert count == len(refSequences)
    totalHeldOut = len(snpSet)
    totalNotHeldOut = totalReferenceLength - totalHeldOut

    class SnpCalls:
        """Collects the calls made by one calling strategy.

        truePositives and falsePositives hold (probability, refPosition)
        tuples; falseNegatives holds
        (refPosition, trueRefBase, mutatedRefBase, probs) tuples.
        """
        def __init__(self):
            self.falsePositives = []
            self.truePositives = []
            self.falseNegatives = []
            self.notCalled = 0

        @staticmethod
        def bucket(calls):
            """Returns 101 cumulative counts: entry i is the number of calls
            whose probability rounds to at least i percent.
            """
            buckets = [0.0] * 101
            for prob in calls:  #Discretize
                buckets[int(round(prob * 100))] += 1
            for i in xrange(len(buckets) - 2, -1, -1):  #Make cumulative
                buckets[i] += buckets[i + 1]
            return buckets

        def getPrecisionByProbability(self):
            tPs = self.bucket([call[0] for call in self.truePositives])
            fPs = self.bucket([call[0] for call in self.falsePositives])
            return [float(tP) / (tP + fP) if tP + fP != 0 else 0
                    for tP, fP in zip(tPs, fPs)]

        def getRecallByProbability(self):
            tPs = self.bucket([call[0] for call in self.truePositives])
            return [tP / totalHeldOut if totalHeldOut != 0 else 0 for tP in tPs]

        def getTruePositiveLocations(self):
            return [call[1] for call in self.truePositives]

        def getFalsePositiveLocations(self):
            return [call[1] for call in self.falsePositives]

        def getFalseNegativeLocations(self):
            return [call[0] for call in self.falseNegatives]

    node = ET.Element("marginAlignComparison")
    for hmmType in ("cactus", "trained_0", "trained_20", "trained_40"):
        hmmFile = trainedHmmFiles.get(hmmType)  #None means realign with the default model
        for coverage in (1000000, 120, 60, 30, 10):
            for replicate in xrange(3 if coverage < 1000000 else 1):  #Do replicates, unless coverage is all
                #Get substitution matrices (rebuilt for every replicate)
                nullSubstitionMatrix = getNullSubstitutionMatrix()
                flatSubstitutionMatrix = getJukesCantorTypeSubstitutionMatrix()
                #NOTE(review): the error matrix always comes from the trained_20
                #hmm, whatever hmmType is being evaluated - confirm this is intended.
                hmmErrorSubstitutionMatrix = loadHmmErrorSubstitutionMatrix(
                    trainedHmmFiles["trained_20"])

                #The data we collect
                expectationsOfBasesAtEachPosition = {}
                frequenciesOfAlignedBasesAtEachPosition = {}
                totalSampledReads = 0
                totalAlignedPairs = 0
                totalReadLength = 0

                sam = pysam.Samfile(self.samFile, "r")
                try:
                    #Get a randomised ordering for the reads
                    reads = [aR for aR in samIterator(sam)]
                    random.shuffle(reads)

                    for aR in reads:  #Iterate on the sam lines
                        if totalReadLength / totalReferenceLength >= coverage:  #Stop when coverage exceeds the quota
                            break
                        totalReadLength += len(readSequences[aR.qname])
                        totalSampledReads += 1

                        #Temporary files
                        tempCigarFile = os.path.join(self.getLocalTempDir(), "rescoredCigar.cig")
                        tempRefFile = os.path.join(self.getLocalTempDir(), "ref.fa")
                        tempReadFile = os.path.join(self.getLocalTempDir(), "read.fa")
                        tempPosteriorProbsFile = os.path.join(self.getLocalTempDir(), "probs.tsv")

                        #Ref name and sequence
                        refSeqName = sam.getrname(aR.rname)
                        refSeq = refSequences[refSeqName]

                        #Walk through the aligned pairs to collate the bases of aligned positions
                        for aP in AlignedPair.iterator(aR, refSeq, readSequences[aR.qname]):
                            totalAlignedPairs += 1  #Record an aligned pair
                            key = (refSeqName, aP.refPos)
                            if key not in frequenciesOfAlignedBasesAtEachPosition:
                                frequenciesOfAlignedBasesAtEachPosition[key] = dict.fromkeys(bases, 0.0)
                            readBase = aP.getReadBase()
                            if readBase in bases:
                                frequenciesOfAlignedBasesAtEachPosition[key][readBase] += 1

                        #Write the temporary files.
                        readSeq = aR.query  #This excludes bases that were soft-clipped and is always of positive strand coordinates
                        fastaWrite(tempRefFile, refSeqName, refSeq)
                        fastaWrite(tempReadFile, aR.qname, readSeq)

                        #Exonerate format Cigar string, which is in readSeq coordinates (positive strand).
                        assert aR.pos == 0
                        assert aR.qstart == 0
                        assert aR.qend == len(readSeq)
                        assert aR.aend == len(refSeq)
                        cigarString = getExonerateCigarFormatString(aR, sam)

                        #Call to cactus_realign; only the trained models add --loadHmm,
                        #the realigned cigar is redirected to tempCigarFile.
                        realignCommand = "echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputAllPosteriorProbs=%s" % \
                            (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile)
                        if hmmFile is not None:
                            realignCommand += " --loadHmm=%s" % hmmFile
                        system(realignCommand + " > %s" % tempCigarFile)

                        #Now collate the reference position expectations. The file is
                        #rewritten for every read, so close it before the next realignment.
                        with open(tempPosteriorProbsFile, 'r') as probsHandle:
                            for line in probsHandle:
                                refPosition, readPosition, posteriorProb = map(float, line.split())
                                key = (refSeqName, int(refPosition))
                                if key not in expectationsOfBasesAtEachPosition:
                                    expectationsOfBasesAtEachPosition[key] = dict.fromkeys(bases, 0.0)
                                readBase = readSeq[int(readPosition)].upper()
                                if readBase in bases:
                                    expectationsOfBasesAtEachPosition[key][readBase] += posteriorProb
                finally:
                    sam.close()  #Release the handle even if a read fails its checks

                #The different call sets
                marginAlignMaxExpectedSnpCalls = SnpCalls()
                marginAlignMaxLikelihoodSnpCalls = SnpCalls()
                maxFrequencySnpCalls = SnpCalls()
                maximumLikelihoodSnpCalls = SnpCalls()

                #(error matrix, evolutionary matrix, per-position table, call set)
                callingStrategies = (
                    (flatSubstitutionMatrix, nullSubstitionMatrix,
                     expectationsOfBasesAtEachPosition, marginAlignMaxExpectedSnpCalls),
                    (hmmErrorSubstitutionMatrix, nullSubstitionMatrix,
                     expectationsOfBasesAtEachPosition, marginAlignMaxLikelihoodSnpCalls),
                    (flatSubstitutionMatrix, nullSubstitionMatrix,
                     frequenciesOfAlignedBasesAtEachPosition, maxFrequencySnpCalls),
                    (hmmErrorSubstitutionMatrix, nullSubstitionMatrix,
                     frequenciesOfAlignedBasesAtEachPosition, maximumLikelihoodSnpCalls))

                #Now calculate the calls
                for refSeqName in refSequences:
                    refSeq = refSequences[refSeqName]
                    for refPosition in xrange(len(refSeq)):
                        key = (refSeqName, refPosition)
                        mutatedRefBase = refSeq[refPosition].upper()
                        trueRefBase = (snpSet[key] if key in snpSet else mutatedRefBase).upper()

                        #Get base calls
                        for errorSubstitutionMatrix, evolutionarySubstitutionMatrix, baseExpectations, snpCalls in callingStrategies:
                            if key in baseExpectations:
                                #Get posterior likelihoods
                                expectations = baseExpectations[key]
                                totalExpectation = sum(expectations.values())
                                if totalExpectation > 0.0:
                                    posteriorProbs = calcBasePosteriorProbs(
                                        dict((base, float(expectations[base]) / totalExpectation)
                                             for base in bases),
                                        mutatedRefBase, evolutionarySubstitutionMatrix,
                                        errorSubstitutionMatrix)
                                    #Every base other than the (mutated) reference base is
                                    #recorded as a candidate call with its posterior probability.
                                    for chosenBase in "ACGT":
                                        if chosenBase == mutatedRefBase:
                                            continue
                                        call = (posteriorProbs[chosenBase], refPosition)
                                        #chosenBase differs from mutatedRefBase here, so matching
                                        #the true base implies the position is a held out snp.
                                        if chosenBase == trueRefBase:
                                            snpCalls.truePositives.append(call)  #True positive
                                        else:
                                            snpCalls.falsePositives.append(call)  #False positive
                                    #NOTE: nothing is appended to snpCalls.falseNegatives in
                                    #this method, so the falseNegative elements written below
                                    #are currently never produced.
                            else:
                                snpCalls.notCalled += 1

                #Now find max-fscore point
                for snpCalls, tagName in (
                        (marginAlignMaxExpectedSnpCalls, "marginAlignMaxExpectedSnpCalls"),
                        (marginAlignMaxLikelihoodSnpCalls, "marginAlignMaxLikelihoodSnpCalls"),
                        (maxFrequencySnpCalls, "maxFrequencySnpCalls"),
                        (maximumLikelihoodSnpCalls, "maximumLikelihoodSnpCalls")):
                    recall = snpCalls.getRecallByProbability()
                    precision = snpCalls.getPrecisionByProbability()
                    assert len(recall) == len(precision)
                    #Ties on the f-score resolve to the highest threshold index
                    fScore, pIndex = max(
                        [(2 * recall[i] * precision[i] / (recall[i] + precision[i])
                          if recall[i] + precision[i] > 0 else 0.0, i)
                         for i in range(len(recall))])
                    optimumProbThreshold = float(pIndex) / 100.0

                    #Guard the average against replicates in which no read was sampled
                    if totalSampledReads > 0:
                        avgSampledReadLength = float(totalReadLength) / totalSampledReads
                    else:
                        avgSampledReadLength = 0.0

                    #Write out the substitution info
                    node2 = ET.SubElement(node, tagName + "_" + hmmType, {
                        "coverage": str(coverage),
                        "actualCoverage": str(float(totalAlignedPairs) / totalReferenceLength),
                        "totalAlignedPairs": str(totalAlignedPairs),
                        "totalReferenceLength": str(totalReferenceLength),
                        "replicate": str(replicate),
                        "totalReads": str(len(reads)),
                        "avgSampledReadLength": str(avgSampledReadLength),
                        "totalSampledReads": str(totalSampledReads),
                        "totalHeldOut": str(totalHeldOut),
                        "totalNonHeldOut": str(totalNotHeldOut),
                        "recall": str(recall[pIndex]),
                        "precision": str(precision[pIndex]),
                        "fScore": str(fScore),
                        "optimumProbThreshold": str(optimumProbThreshold),
                        "totalNoCalls": str(snpCalls.notCalled),
                        "recallByProbability": " ".join(map(str, recall)),
                        "precisionByProbability": " ".join(map(str, precision))})

                    for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives:
                        ET.SubElement(node2, "falseNegative_%s_%s" % (trueRefBase, mutatedRefBase),
                                      {"posteriorProbs": " ".join(map(str, posteriorProbs))})

                    #Combine the false negatives sharing the same true/mutated base pair
                    for falseNegativeBase in bases:
                        for mutatedBase in bases:
                            posteriorProbsArray = [
                                posteriorProbs
                                for refPosition, trueRefBase, mutatedRefBase, posteriorProbs in snpCalls.falseNegatives
                                if (trueRefBase.upper() == falseNegativeBase.upper() and
                                    mutatedBase.upper() == mutatedRefBase.upper())]
                            if len(posteriorProbsArray) > 0:
                                #Sum position-wise, then normalise by the overall total
                                summedProbs = [sum(column) for column in zip(*posteriorProbsArray)]
                                total = sum(summedProbs)
                                summedProbs = [float(prob) / total for prob in summedProbs]
                                ET.SubElement(node2, "combinedFalseNegative_%s_%s" % (falseNegativeBase, mutatedBase),
                                              {"posteriorProbs": " ".join(map(str, summedProbs))})

    with open(os.path.join(self.outputDir, "marginaliseConsensus.xml"), "w") as outputHandle:
        outputHandle.write(prettyXml(node))

    #Indicate everything is all done
    self.finish()
def learnModelFromSamFileTargetFn(target, samFile, readFastqFile,
                                  referenceFastaFile, outputModel):
    """Does expectation maximisation on sam file to learn the hmm for the sam file.

    Writes two files into the target's global temp dir: "temp.fa", holding
    every read plus its reverse complement (named <read>_reverse), and
    "temp.cigar", holding one exonerate format cigar per sam line. Unless
    <outputModel>_unnormalised already exists, a child target is added to run
    cactus_expectationMaximisation.expectationMaximisationTrials on those
    files. learnModelFromSamFileTargetFn2 is always set as the follow on, with
    the unnormalised model path and outputModel as its arguments.

    The alignments are expected to be global; an AssertionError is raised for
    any sam line that does not span its whole read and reference sequence.
    """
    refSequences = getFastaDictionary(referenceFastaFile)  #Hash of names to sequences
    readSequences = getFastqDictionary(readFastqFile)  #Hash of names to sequences

    #Convert the read file to fasta, adding the reverse complement of each
    #read so that reverse strand alignments can refer to it by name
    reads = os.path.join(target.getGlobalTempDir(), "temp.fa")
    with open(reads, 'w') as readsHandle:
        for name, seq in readSequences.items():
            fastaWrite(readsHandle, name, seq)
            fastaWrite(readsHandle, name + "_reverse", reverseComplement(seq))

    #Get cigars file
    cigars = os.path.join(target.getGlobalTempDir(), "temp.cigar")
    sam = pysam.Samfile(samFile, "r")
    try:
        with open(cigars, 'w') as cigarsHandle:
            for aR in sam:  #Iterate on the sam lines
                #Because these are global alignments with reverse complement coordinates reversed the following should all be true
                assert aR.pos == 0
                assert aR.qstart == 0
                readSeq = readSequences[aR.qname]
                assert aR.qend == len(readSeq)
                assert aR.aend == len(refSequences[sam.getrname(aR.rname)])
                assert len(aR.query) == len(readSeq)
                if aR.is_reverse:  #Deal with reverse complements
                    assert aR.query.upper() == reverseComplement(readSeq).upper()
                    #Rename so the cigar refers to the "_reverse" entry of temp.fa
                    aR.qname += "_reverse"
                else:
                    assert aR.query.upper() == readSeq.upper()
                #Exonerate format Cigar string, using global coordinates
                cigarsHandle.write(getExonerateCigarFormatString(aR, sam) + "\n")
    finally:
        sam.close()  #Close the sam handle even when a sanity check fails

    #Run cactus_expectationMaximisation
    options = cactus_expectationMaximisation.Options()
    options.modelType = "fiveStateAsymmetric"  #"threeStateAsymmetric"
    options.optionsToRealign = "--diagonalExpansion=10 --splitMatrixBiggerThanThis=300"
    options.randomStart = True
    options.trials = 3
    options.outputTrialHmms = True
    options.iterations = 100
    options.maxAlignmentLengthPerJob = 700000
    options.maxAlignmentLengthToSample = 50000000
    options.outputXMLModelFile = outputModel + ".xml"
    #options.updateTheBand = True
    #options.useDefaultModelAsStart = True
    #options.setJukesCantorStartingEmissions=0.3
    options.trainEmissions = True
    #options.tieEmissions = True

    unnormalisedOutputModel = outputModel + "_unnormalised"
    #Do training if necessary
    if not os.path.exists(unnormalisedOutputModel):
        target.addChildTargetFn(
            cactus_expectationMaximisation.expectationMaximisationTrials,
            args=(" ".join([reads, referenceFastaFile]), cigars,
                  unnormalisedOutputModel, options))

    #Now set up normalisation
    target.setFollowOnTargetFn(learnModelFromSamFileTargetFn2,
                               args=(unnormalisedOutputModel, outputModel))