def run(self):
    """Rescore every alignment in the SAM file and report alignment uncertainty.

    For each aligned read the method writes the reference and read sequences
    to temporary fasta files, pipes the alignment (as an exonerate-format
    cigar) through ``cactus_realign`` with the posterior-probability
    rescoring flags, and records the score of the single cigar that comes
    back together with the number of aligned (match) pairs in it.

    Outputs:
      * ``alignmentUncertainty.xml`` in ``self.outputDir`` holding the
        per-read scores, the aligned pair counts and two averages.
      * ``posterior_prob_hist.pdf`` in ``self.outputDir`` (only when at
        least one read was processed), drawn by ``match_hist.R``.

    Raises:
      AssertionError: if cactus_realign does not return exactly one cigar
        for a read, or if the number of match pairs in that cigar differs
        from the number of aligned pairs reported by the SAM record.
    """
    AbstractAnalysis.run(self) #Call base method to do some logging
    refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences

    #The temporary files are overwritten for every read, so name them once.
    localTempDir = self.getLocalTempDir()
    tempCigarFile = os.path.join(localTempDir, "rescoredCigar.cig")
    tempRefFile = os.path.join(localTempDir, "ref.fa")
    tempReadFile = os.path.join(localTempDir, "read.fa")
    tempPosteriorProbsFile = os.path.join(localTempDir, "probs.tsv")

    #Trained hmm file to use.
    hmmFile = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")

    #The data we collect
    avgPosteriorMatchProbabilityInCigar = []
    alignedPairsInCigar = []

    sam = pysam.Samfile(self.samFile, "r")
    try:
        for aR in samIterator(sam): #Iterate on the sam lines
            #Exonerate format Cigar string
            cigarString = getExonerateCigarFormatString(aR, sam)

            #Write the temporary files.
            refName = sam.getrname(aR.rname)
            fastaWrite(tempRefFile, refName, refSequences[refName])
            fastaWrite(tempReadFile, aR.qname, aR.query)

            #Call to cactus_realign
            #NOTE(review): the arguments are interpolated unquoted into a shell
            #command, so paths containing spaces or shell metacharacters would break it.
            system("echo %s | cactus_realign %s %s --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputPosteriorProbs=%s --loadHmm=%s > %s" %
                   (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile, tempCigarFile))

            #Load the cigar once (and close the file) and get the posterior prob
            with open(tempCigarFile) as cigarHandle:
                rescoredAlignments = list(cigarRead(cigarHandle))
            assert len(rescoredAlignments) == 1, \
                "Expected exactly one rescored cigar in %s, found %s" % (tempCigarFile, len(rescoredAlignments))
            pA = rescoredAlignments[0]
            avgPosteriorMatchProbabilityInCigar.append(pA.score)

            #Calculate the number of aligned pairs in the cigar
            alignedPairsInCigar.append(sum([ op.length for op in pA.operationList if op.type == PairwiseAlignment.PAIRWISE_MATCH ]))
            #The rescored cigar must align exactly the pairs the sam record aligns
            assert alignedPairsInCigar[-1] == sum(1 for readPos, refPos in aR.aligned_pairs
                                                  if readPos is not None and refPos is not None)
    finally:
        #Release the sam file even if a read fails part way through
        sam.close()

    #Write out the substitution info
    #The second average weights each read's score by its number of aligned pairs.
    weightedMatchProbability = float(sum([ avgMatchProb*alignedPairs for avgMatchProb, alignedPairs in zip(avgPosteriorMatchProbabilityInCigar, alignedPairsInCigar) ]))
    node = ET.Element("alignmentUncertainty", {
        "averagePosteriorMatchProbabilityPerRead":str(self.formatRatio(sum(avgPosteriorMatchProbabilityInCigar), len(avgPosteriorMatchProbabilityInCigar))),
        "averagePosteriorMatchProbability":str(self.formatRatio(weightedMatchProbability, sum(alignedPairsInCigar))),
        "averagePosteriorMatchProbabilitesPerRead":",".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]),
        "alignedPairsInCigar":",".join([ str(i) for i in alignedPairsInCigar ]) })
    with open(os.path.join(self.outputDir, "alignmentUncertainty.xml"), "w") as xmlHandle:
        xmlHandle.write(prettyXml(node))

    if len(avgPosteriorMatchProbabilityInCigar) > 0:
        #Hand the per-read scores to R as a single tab separated line
        uncertaintyFile = os.path.join(localTempDir, "tmp_uncertainty")
        with open(uncertaintyFile, "w") as outf:
            outf.write("\t".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]))
            outf.write("\n")
        #The R script path is relative, so this relies on the current working directory.
        system("Rscript nanopore/analyses/match_hist.R {} {}".format(uncertaintyFile, os.path.join(self.outputDir, "posterior_prob_hist.pdf")))

    #Indicate everything is all done
    self.finish()
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Realign one pairwise alignment with cactus_realign.

    Writes the reference and query sequences to fasta files in the target's
    global temp dir, pipes ``exonerateCigarString`` into ``cactus_realign``
    and redirects the realigned cigar to ``outputCigarFile``.

    Args:
      target: object providing ``getGlobalTempDir()``.
      exonerateCigarString: the alignment to realign, echoed to cactus_realign's stdin.
      referenceSequenceName, referenceSequence: header and sequence written to ``ref.fa``.
      querySequenceName, querySequence: header and sequence written to ``read.fa``.
      outputCigarFile: path that receives cactus_realign's standard output.
      hmmFile: passed through ``nameValue("loadHmm", ...)``; presumably rendered
        as a ``--loadHmm`` option - confirm against nameValue.
      gapGamma, matchGamma: values for the ``--gapGamma`` / ``--matchGamma`` options.

    Raises:
      AssertionError: if ``outputCigarFile`` does not contain exactly one cigar.
    """
    #Temporary files
    globalTempDir = target.getGlobalTempDir()
    tempRefFile = os.path.join(globalTempDir, "ref.fa")
    tempReadFile = os.path.join(globalTempDir, "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    #NOTE(review): the arguments are interpolated unquoted into a shell command,
    #so paths containing spaces or shell metacharacters would break it.
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" %
           (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))

    #Parse the output once, closing the file, and check there is exactly one cigar
    with open(outputCigarFile) as cigarHandle:
        realignedCigars = list(cigarRead(cigarHandle))
    assert len(realignedCigars) == 1, \
        "Expected exactly one realigned cigar in %s, found %s" % (outputCigarFile, len(realignedCigars))
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Run cactus_realign on a single alignment and validate its output.

    The reference and query sequences are written to ``ref.fa`` and
    ``read.fa`` inside ``target.getGlobalTempDir()``. The exonerate-format
    cigar is echoed into ``cactus_realign`` and the command's standard
    output is redirected to ``outputCigarFile``.

    Args:
      target: object providing ``getGlobalTempDir()``.
      exonerateCigarString: alignment to realign, sent on cactus_realign's stdin.
      referenceSequenceName, referenceSequence: header and sequence for ``ref.fa``.
      querySequenceName, querySequence: header and sequence for ``read.fa``.
      outputCigarFile: destination of the realigned cigar.
      hmmFile: given to ``nameValue("loadHmm", ...)``; presumably becomes a
        ``--loadHmm`` option - confirm against nameValue.
      gapGamma, matchGamma: values substituted into ``--gapGamma`` / ``--matchGamma``.

    Raises:
      AssertionError: if the output file does not hold exactly one cigar.
    """
    #Temporary files
    tempDir = target.getGlobalTempDir()
    tempRefFile = os.path.join(tempDir, "ref.fa")
    tempReadFile = os.path.join(tempDir, "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    #NOTE(review): nothing here is shell-quoted; unusual characters in the
    #paths or sequence names would be interpreted by the shell.
    loadHmm = nameValue("loadHmm", hmmFile)
    command = "echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" % (
        exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile)
    system(command)

    #Read the result a single time so the handle is closed before validating
    with open(outputCigarFile) as outputHandle:
        cigarCount = len(list(cigarRead(outputHandle)))
    assert cigarCount == 1, \
        "Expected exactly one realigned cigar in %s, found %s" % (outputCigarFile, cigarCount)
def realignSamFile3TargetFn(target, samFile, outputSamFile, tempCigarFiles):
    """Write a copy of ``samFile`` whose cigars come from the realigned cigar files.

    The sam records and ``tempCigarFiles`` are walked in lockstep: the first
    cigar in each temp file replaces the cigar of the matching sam record,
    which is then written to ``outputSamFile`` (opened in "wh" mode with the
    input as template).

    Args:
      target: unused here; kept so the signature matches the other target functions.
      samFile: path of the input sam file.
      outputSamFile: path of the sam file to create.
      tempCigarFiles: cigar file paths, one per sam record, in sam order.
        ``zip`` stops at the shorter sequence, so surplus records or files
        are silently ignored.

    Raises:
      IndexError: if a temp cigar file contains no cigar.
    """
    #Setup input and output sam files
    sam = pysam.Samfile(samFile, "r")
    try:
        #Replace the cigar lines with the realigned cigar lines
        outputSam = pysam.Samfile(outputSamFile, "wh", template=sam)
        try:
            for aR, tempCigarFile in zip(samIterator(sam), tempCigarFiles):
                #Load the cigar, closing the file as soon as it is parsed
                with open(tempCigarFile) as cigarHandle:
                    pA = list(cigarRead(cigarHandle))[0]
                #Convert to sam line
                aR.cigar = tuple([ (op.type, op.length) for op in pA.operationList ])
                #Write out
                outputSam.write(aR)
        finally:
            outputSam.close()
    finally:
        #Finish up, even if a record failed part way through
        sam.close()
def realignSamFile3TargetFn(target, samFile, outputSamFile, tempCigarFiles):
    """Replace the cigar of every sam record with its realigned cigar.

    Each record of ``samFile`` is paired, in order, with one path from
    ``tempCigarFiles``. The first cigar read from that file becomes the
    record's cigar and the record is written to ``outputSamFile``, which is
    opened in "wh" mode using the input sam file as template.

    Args:
      target: not used in the body; part of the target-function signature.
      samFile: input sam file path.
      outputSamFile: output sam file path.
      tempCigarFiles: realigned cigar file paths in the same order as the
        sam records. Pairing uses ``zip``, so any extra records or extra
        files beyond the shorter sequence are dropped without error.

    Raises:
      IndexError: if one of the temp cigar files holds no cigar.
    """
    #Setup input and output sam files
    sam = pysam.Samfile(samFile, "r")
    try:
        outputSam = pysam.Samfile(outputSamFile, "wh", template=sam)
        try:
            #Iterate on the sam lines, pairing each with its realigned cigar file
            for aR, tempCigarFile in zip(samIterator(sam), tempCigarFiles):
                #Parse inside "with" so the cigar file handle is not leaked
                with open(tempCigarFile) as cigarHandle:
                    realignedCigars = list(cigarRead(cigarHandle))
                pA = realignedCigars[0]
                #Convert to sam line
                aR.cigar = tuple([(op.type, op.length) for op in pA.operationList])
                #Write out
                outputSam.write(aR)
        finally:
            outputSam.close()
    finally:
        #Both sam files are closed even when a record raises
        sam.close()
def cigarIterator():
    """Yield every cigar from the temp cigar files in order, then a final None.

    ``tempCigarFiles`` is not a parameter; it is looked up in the enclosing scope.
    """
    #Iterates over all the cigars in the temp files.
    for tempCigarFile in tempCigarFiles:
        #Open with "with" so each file is closed once its cigars are consumed
        with open(tempCigarFile) as cigarHandle:
            for pA in cigarRead(cigarHandle):
                yield pA
    #Trailing sentinel. The original comment is cut off in this view; presumably it
    #makes the consumer fail when there are fewer cigars than expected - confirm against the caller.
    yield None