Beispiel #1
0
    def run(self):
        """Rescore every SAM alignment with cactus_realign and report the posterior match probabilities.

        For each record yielded by samIterator the reference and read sequences are
        written to temporary fasta files, the record's exonerate-format cigar is piped
        through cactus_realign, and the score of the single rescored alignment is
        collected together with its number of aligned pairs.

        Outputs:
          * <outputDir>/alignmentUncertainty.xml with the per-read and the
            aligned-pair-weighted average scores plus the raw per-read lists.
          * <outputDir>/posterior_prob_hist.pdf, drawn by match_hist.R, only when at
            least one alignment was processed.

        Raises:
            AssertionError: if cactus_realign does not emit exactly one alignment for
                a record, or if the matched length of the rescored cigar differs from
                the number of aligned pairs pysam reports for the record.
        """
        AbstractAnalysis.run(self) #Call base method to do some logging
        refSequences = getFastaDictionary(self.referenceFastaFile) #Hash of names to sequences

        #The data we collect, one entry per sam record
        avgPosteriorMatchProbabilityInCigar = []
        alignedPairsInCigar = []

        sam = pysam.Samfile(self.samFile, "r")
        try:
            for aR in samIterator(sam): #Iterate on the sam lines
                #Exonerate format Cigar string
                cigarString = getExonerateCigarFormatString(aR, sam)

                #Temporary files (the same paths are overwritten on every iteration)
                tempCigarFile = os.path.join(self.getLocalTempDir(), "rescoredCigar.cig")
                tempRefFile = os.path.join(self.getLocalTempDir(), "ref.fa")
                tempReadFile = os.path.join(self.getLocalTempDir(), "read.fa")
                tempPosteriorProbsFile = os.path.join(self.getLocalTempDir(), "probs.tsv")

                #Write the temporary files.
                refName = sam.getrname(aR.rname)
                fastaWrite(tempRefFile, refName, refSequences[refName])
                fastaWrite(tempReadFile, aR.qname, aR.query)

                #Trained hmm file to use.
                hmmFile = os.path.join(pathToBaseNanoporeDir(), "nanopore", "mappers", "blasr_hmm_0.txt")

                #Call to cactus_realign
                #NOTE(review): the cigar string and the paths are interpolated into a shell
                #command unquoted; this assumes they contain no shell metacharacters.
                system("echo %s | cactus_realign %s %s --rescoreByPosteriorProbIgnoringGaps --rescoreOriginalAlignment --diagonalExpansion=10 --splitMatrixBiggerThanThis=100 --outputPosteriorProbs=%s --loadHmm=%s > %s" % \
                       (cigarString, tempRefFile, tempReadFile, tempPosteriorProbsFile, hmmFile, tempCigarFile))

                #Load the cigar once, closing the file handle straight away
                with open(tempCigarFile) as cigarFileHandle:
                    rescoredAlignments = list(cigarRead(cigarFileHandle))
                assert len(rescoredAlignments) == 1, \
                    "Expected exactly one rescored alignment in %s, got %s" % (tempCigarFile, len(rescoredAlignments))
                pA = rescoredAlignments[0]
                #The score is used as the read's average posterior match probability
                #(presumably set by --rescoreByPosteriorProbIgnoringGaps; confirm against cactus_realign)
                avgPosteriorMatchProbabilityInCigar.append(pA.score)

                #Calculate the number of aligned pairs in the cigar
                alignedPairs = sum(op.length for op in pA.operationList if op.type == PairwiseAlignment.PAIRWISE_MATCH)
                alignedPairsInCigar.append(alignedPairs)
                #Sanity check: rescoring must not have changed the alignment itself
                assert alignedPairs == sum(1 for readPos, refPos in aR.aligned_pairs if readPos is not None and refPos is not None)
        finally:
            #Release the sam handle even if an iteration fails
            sam.close()

        #Write out the substitution info
        weightedScoreSum = float(sum(avgMatchProb * alignedPairs for avgMatchProb, alignedPairs in zip(avgPosteriorMatchProbabilityInCigar, alignedPairsInCigar)))
        node = ET.Element("alignmentUncertainty", {
                "averagePosteriorMatchProbabilityPerRead":str(self.formatRatio(sum(avgPosteriorMatchProbabilityInCigar), len(avgPosteriorMatchProbabilityInCigar))),
                "averagePosteriorMatchProbability":str(self.formatRatio(weightedScoreSum, sum(alignedPairsInCigar))),
                "averagePosteriorMatchProbabilitesPerRead":",".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]),
                "alignedPairsInCigar":",".join([ str(i) for i in alignedPairsInCigar ]) })
        with open(os.path.join(self.outputDir, "alignmentUncertainty.xml"), "w") as xmlHandle:
            xmlHandle.write(prettyXml(node))

        #Only draw the histogram when there is something to plot
        if len(avgPosteriorMatchProbabilityInCigar) > 0:
            tmpUncertaintyFile = os.path.join(self.getLocalTempDir(), "tmp_uncertainty")
            with open(tmpUncertaintyFile, "w") as outf:
                outf.write("\t".join([ str(i) for i in avgPosteriorMatchProbabilityInCigar ]))
                outf.write("\n")
            system("Rscript nanopore/analyses/match_hist.R {} {}".format(tmpUncertaintyFile, os.path.join(self.outputDir, "posterior_prob_hist.pdf")))
        #Indicate everything is all done
        self.finish()
Beispiel #2
0
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName, referenceSequence, querySequenceName, querySequence, outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Realign a single pairwise alignment with cactus_realign.

    The reference and query sequences are written to "ref.fa" and "read.fa" in the
    target's global temp dir, exonerateCigarString is piped into cactus_realign, and
    the program's stdout is redirected to outputCigarFile.

    Args:
        target: object providing getGlobalTempDir().
        exonerateCigarString: the alignment to realign, echoed to cactus_realign's stdin.
        referenceSequenceName, referenceSequence: name and sequence written to ref.fa.
        querySequenceName, querySequence: name and sequence written to read.fa.
        outputCigarFile: path that receives the realigned cigar.
        hmmFile: passed through nameValue("loadHmm", ...) to build the command-line option.
        gapGamma, matchGamma: forwarded as --gapGamma / --matchGamma.

    Raises:
        AssertionError: if outputCigarFile does not contain exactly one alignment.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getGlobalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getGlobalTempDir(), "read.fa")
    
    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence) 
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system("echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s" % (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma, matchGamma, outputCigarFile))

    #Check the output: parse the file once and close the handle promptly
    with open(outputCigarFile) as cigarFileHandle:
        realignedCount = len(list(cigarRead(cigarFileHandle)))
    assert realignedCount == 1, "Expected exactly one alignment in %s, got %s" % (outputCigarFile, realignedCount)
Beispiel #3
0
def realignCigarTargetFn(target, exonerateCigarString, referenceSequenceName,
                         referenceSequence, querySequenceName, querySequence,
                         outputCigarFile, hmmFile, gapGamma, matchGamma):
    """Run cactus_realign on one alignment and check that exactly one cigar comes back.

    Writes the reference and query sequences as fasta files ("ref.fa", "read.fa")
    into target.getGlobalTempDir(), pipes exonerateCigarString into cactus_realign
    with the given hmm option and gamma values, and redirects the output to
    outputCigarFile.

    Raises:
        AssertionError: if outputCigarFile holds zero or more than one alignment.
    """
    #Temporary files
    tempRefFile = os.path.join(target.getGlobalTempDir(), "ref.fa")
    tempReadFile = os.path.join(target.getGlobalTempDir(), "read.fa")

    #Write the temporary files.
    fastaWrite(tempRefFile, referenceSequenceName, referenceSequence)
    fastaWrite(tempReadFile, querySequenceName, querySequence)

    #Call to cactus_realign
    loadHmm = nameValue("loadHmm", hmmFile)
    system(
        "echo %s | cactus_realign %s %s --diagonalExpansion=10 --splitMatrixBiggerThanThis=3000 %s --gapGamma=%s --matchGamma=%s > %s"
        % (exonerateCigarString, tempRefFile, tempReadFile, loadHmm, gapGamma,
           matchGamma, outputCigarFile))

    #Validate the result with a single pass over the file; "with" closes the handle
    with open(outputCigarFile) as cigarFileHandle:
        alignments = list(cigarRead(cigarFileHandle))
    assert len(alignments) == 1, (
        "Expected exactly one alignment in %s, got %s" %
        (outputCigarFile, len(alignments)))
Beispiel #4
0
def realignSamFile3TargetFn(target, samFile, outputSamFile, tempCigarFiles):
    """Copy samFile to outputSamFile, replacing each record's cigar with a realigned one.

    The i-th record yielded by samIterator is paired with the i-th path in
    tempCigarFiles; the first alignment read from that file supplies the new cigar
    as (op.type, op.length) tuples.

    Args:
        target: unused here.
        samFile: path of the input sam file.
        outputSamFile: path of the sam file to write (with header, templated on the input).
        tempCigarFiles: cigar file paths, in the same order as the sam records.

    Raises:
        IndexError: if a cigar file contains no alignment.
    """
    #Setup input and output sam files
    sam = pysam.Samfile(samFile, "r" )
    try:
        #Replace the cigar lines with the realigned cigar lines
        outputSam = pysam.Samfile(outputSamFile, "wh", template=sam)
        try:
            #NOTE(review): zip stops at the shorter input, so sam records without a
            #matching cigar file are silently dropped - confirm callers guarantee equal lengths.
            for aR, tempCigarFile in zip(samIterator(sam), tempCigarFiles):
                #Load the cigar, closing the file before moving on to the next record
                with open(tempCigarFile) as cigarFileHandle:
                    pA = list(cigarRead(cigarFileHandle))[0]
                
                #Convert to sam line
                aR.cigar = tuple([ (op.type, op.length) for op in pA.operationList ])
                
                #Write out
                outputSam.write(aR)
        finally:
            outputSam.close()
    finally:
        #Finish up: both handles are released even if a record fails
        sam.close()
Beispiel #5
0
def realignSamFile3TargetFn(target, samFile, outputSamFile, tempCigarFiles):
    """Write a copy of samFile whose cigars come from the realigned cigar files.

    Sam records (in samIterator order) are matched positionally with the paths in
    tempCigarFiles. For each pair the first alignment in the cigar file is loaded
    and its operations, as (op.type, op.length) tuples, become the record's cigar
    before the record is written to outputSamFile.

    Raises:
        IndexError: if one of the cigar files holds no alignment.
    """
    #Setup input and output sam files
    sam = pysam.Samfile(samFile, "r")
    try:
        #Replace the cigar lines with the realigned cigar lines
        outputSam = pysam.Samfile(outputSamFile, "wh", template=sam)
        try:
            #NOTE(review): zip truncates to the shorter sequence; a shortfall of
            #cigar files drops the remaining sam records without an error.
            for aR, tempCigarFile in zip(samIterator(sam), tempCigarFiles):
                #Load the cigar; "with" closes the handle once it has been parsed
                with open(tempCigarFile) as cigarFileHandle:
                    pA = list(cigarRead(cigarFileHandle))[0]

                #Convert to sam line
                aR.cigar = tuple([(op.type, op.length) for op in pA.operationList])

                #Write out
                outputSam.write(aR)
        finally:
            outputSam.close()
    finally:
        #Finish up, also on failure
        sam.close()
Beispiel #6
0
 def cigarIterator():
     """Yield every alignment from the files in tempCigarFiles, in order, then a final None."""
     #Iterates over all the cigars in the temp files.
     for tempCigarFile in tempCigarFiles:
         #Close each temp file as soon as its cigars have been consumed
         with open(tempCigarFile) as cigarFileHandle:
             for pA in cigarRead(cigarFileHandle):
                 yield pA
     #Trailing None: a consumer that asks for more cigars than the files contain
     #receives None, which is meant to cause an error downstream
     #(presumably when there are fewer cigars than expected - confirm against the caller).
     yield None
Beispiel #7
0
 def cigarIterator():
     """Generate the alignments stored in each of tempCigarFiles in turn, followed by one None."""
     #Iterates over all the cigars in the temp files.
     for tempCigarFile in tempCigarFiles:
         #The "with" block releases the file handle once the file is exhausted
         with open(tempCigarFile) as cigarFileHandle:
             for pA in cigarRead(cigarFileHandle):
                 yield pA
     #The final None is put in to cause an error if the consumer requests more
     #cigars than were available (presumably fewer cigars than sam records - verify with the caller).
     yield None