def testCPecanEmMultipleTrials(self):
    """Runs cPecanEm with multiple random-start trials per sequence pair.

    For every pair of sequence files yielded by seqFilePairGenerator() this:

    * computes alignments and runs cPecanEm with 3 trials of 5 EM
      iterations each, asking for the per-trial HMMs, an XML model file
      and a blast scoring matrix file;
    * asserts that the "maxLikelihood" attribute of the XML root equals
      the likelihood of the model written to outputModelFile;
    * recomputes the alignments, passing the blast scoring file through
      lastzArguments;
    * runs cPecanModifyHmm in place on the model file and reloads the
      model and XML files.

    The temporary directory is removed even when a step fails.
    """
    trials = 3
    for seqFile1, seqFile2 in seqFilePairGenerator():
        tempDir = getTempDirectory(rootDir=os.getcwd())
        try:
            jobTreeDir = os.path.join(tempDir, "jobTree")
            alignmentsFile = os.path.join(tempDir, "alignments.cigars")
            outputModelFile = os.path.join(tempDir, "outputModel.txt")
            outputModelXMLFile = os.path.join(tempDir, "outputModel.xml")
            outputBlastFile = os.path.join(tempDir, "outputBlast.txt")

            computeAlignments(seqFile1, seqFile2, alignmentsFile)
            logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))

            # Train from random starting points; outputTrialHmms=True also
            # requests one model file per trial.
            runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                        alignmentsFile=alignmentsFile,
                        outputModelFile=outputModelFile,
                        jobTreeDir=jobTreeDir,
                        trials=trials,
                        outputTrialHmms=True,
                        iterations=5,
                        randomStart=True,
                        logLevel=getLogLevelString(),
                        optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100",
                        outputXMLModelFile=outputModelXMLFile,
                        blastScoringMatrixFile=outputBlastFile)

            # Per-trial models are read from outputModelFile + "_<trial index>".
            trialHmms = [Hmm.loadHmm(outputModelFile + ("_%i" % i))
                         for i in range(trials)]
            hmm = Hmm.loadHmm(outputModelFile)
            node = ET.parse(outputModelXMLFile).getroot()
            logger.info("After multiple trials and iterations of EM the best likelihood found was %s, the likelihoods of the variants were: %s" %
                        (hmm.likelihood,
                         " ".join(str(trialHmm.likelihood) for trialHmm in trialHmms)))

            matchProbs, gapOpen, gapExtend = makeBlastScoringMatrix(hmm, ("ACTG",))
            logger.info("Gap open: %s, Gap extend: %s, Match probs %s" %
                        (gapOpen, gapExtend, " ".join(map(str, matchProbs))))

            # The XML summary and the model file must report the same
            # likelihood (exact comparison, as in the original check).
            self.assertEqual(float(node.attrib["maxLikelihood"]), hmm.likelihood)

            # Now use the blast file to compute a new set of alignments.
            computeAlignments(seqFile1, seqFile2, alignmentsFile,
                              lastzArguments=("--ambiguous=iupac --scores=%s" % outputBlastFile))

            # Run modifyHmm (input and output are the same file) to check it works.
            system("cPecanModifyHmm %s %s --gcContent=0.5 --substitutionRate=0.05 --setFlatIndelEmissions" %
                   (outputModelFile, outputModelFile))
            # Reload both files; the results are not inspected, so this
            # only exercises the loaders on the post-modification outputs.
            Hmm.loadHmm(outputModelFile)
            ET.parse(outputModelXMLFile).getroot()
        finally:
            # Always clean up, so a failing pair does not leave a temp
            # directory behind in the working directory.
            system("rm -rf %s" % tempDir)
def testCPecanEm(self):
    """Runs cPecanEm and checks that further EM iterations raise the likelihood.

    For each model type ("fiveState", "fiveStateAsymmetric", "threeState",
    "threeStateAsymmetric") and each pair of sequence files yielded by
    seqFilePairGenerator():

    * a model is trained for one EM iteration from a non-random start and
      its likelihood is recorded;
    * training is continued from that model file for 5 more iterations;
    * the test asserts that the likelihood strictly increased.

    The temporary directory is removed even when a step fails.
    """
    realignOptions = "--diagonalExpansion=6 --splitMatrixBiggerThanThis=100"
    iterations = 5
    trial = 0
    for modelType in ("fiveState", "fiveStateAsymmetric",
                      "threeState", "threeStateAsymmetric"):
        for seqFile1, seqFile2 in seqFilePairGenerator():
            tempDir = getTempDirectory(rootDir=os.getcwd())
            try:
                jobTreeDir = os.path.join(tempDir, "jobTree")
                alignmentsFile = os.path.join(tempDir, "alignments.cigars")
                outputModelFile = os.path.join(tempDir, "outputModel.txt")

                computeAlignments(seqFile1, seqFile2, alignmentsFile)
                logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))

                # First run the script to generate a model and do one
                # iteration of EM, to get the likelihood to compare with
                # the final likelihood.
                runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                            alignmentsFile=alignmentsFile,
                            outputModelFile=outputModelFile,
                            modelType=modelType,
                            jobTreeDir=jobTreeDir,
                            iterations=1,
                            trials=1,
                            randomStart=False,
                            logLevel=getLogLevelString(),
                            setJukesCantorStartingEmissions=0.2,
                            trainEmissions=True,
                            tieEmissions=True,
                            optionsToRealign=realignOptions)
                hmm = Hmm.loadHmm(outputModelFile)
                system("rm -rf %s" % jobTreeDir)  # Cleanup the old jobTree
                logger.info("For trial %s the likelihood after 1 iteration of EM is %s" %
                            (trial, hmm.likelihood))

                # Continue training from the model just written; the same
                # file is used as input and output.
                runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                            alignmentsFile=alignmentsFile,
                            outputModelFile=outputModelFile,
                            jobTreeDir=jobTreeDir,
                            optionsToRealign=realignOptions,
                            iterations=iterations,
                            inputModelFile=outputModelFile,
                            logLevel=getLogLevelString(),
                            maxAlignmentLengthPerJob=10000)
                hmm2 = Hmm.loadHmm(outputModelFile)
                logger.info("For trial %s the likelihood after a further %s iterations of EM is %s" %
                            (trial, iterations, hmm2.likelihood))

                self.assertLess(hmm.likelihood, hmm2.likelihood)

                hmm2.normalise()
                logger.info("Final transitions: %s" % " ".join(map(str, hmm2.transitions)))
                logger.info("Final emissions: %s" % " ".join(map(str, hmm2.emissions)))
            finally:
                # Always clean up, so a failing trial does not leave a temp
                # directory behind in the working directory.
                system("rm -rf %s" % tempDir)
            trial += 1
def testCPecanEmMultipleTrials(self):
    """Runs cPecanEm with multiple random-start trials per sequence pair.

    For every pair of sequence files yielded by seqFilePairGenerator() this:

    * computes alignments and runs cPecanEm with 3 trials of 5 EM
      iterations each, asking for the per-trial HMMs, an XML model file
      and a blast scoring matrix file;
    * asserts that the "maxLikelihood" attribute of the XML root equals
      the likelihood of the model written to outputModelFile;
    * recomputes the alignments, passing the blast scoring file through
      lastzArguments;
    * runs cPecanModifyHmm in place on the model file and reloads the
      model and XML files.

    The temporary directory is removed even when a step fails.
    """
    trials = 3
    for seqFile1, seqFile2 in seqFilePairGenerator():
        tempDir = getTempDirectory(rootDir=os.getcwd())
        try:
            jobTreeDir = os.path.join(tempDir, "jobTree")
            alignmentsFile = os.path.join(tempDir, "alignments.cigars")
            outputModelFile = os.path.join(tempDir, "outputModel.txt")
            outputModelXMLFile = os.path.join(tempDir, "outputModel.xml")
            outputBlastFile = os.path.join(tempDir, "outputBlast.txt")

            computeAlignments(seqFile1, seqFile2, alignmentsFile)
            logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))

            # Train from random starting points; outputTrialHmms=True also
            # requests one model file per trial.
            runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                        alignmentsFile=alignmentsFile,
                        outputModelFile=outputModelFile,
                        jobTreeDir=jobTreeDir,
                        trials=trials,
                        outputTrialHmms=True,
                        iterations=5,
                        randomStart=True,
                        logLevel=getLogLevelString(),
                        optionsToRealign="--diagonalExpansion=6 --splitMatrixBiggerThanThis=100",
                        outputXMLModelFile=outputModelXMLFile,
                        blastScoringMatrixFile=outputBlastFile)

            # Per-trial models are read from outputModelFile + "_<trial index>".
            trialHmms = [Hmm.loadHmm(outputModelFile + ("_%i" % i))
                         for i in range(trials)]
            hmm = Hmm.loadHmm(outputModelFile)
            node = ET.parse(outputModelXMLFile).getroot()
            logger.info("After multiple trials and iterations of EM the best likelihood found was %s, the likelihoods of the variants were: %s" %
                        (hmm.likelihood,
                         " ".join(str(trialHmm.likelihood) for trialHmm in trialHmms)))

            matchProbs, gapOpen, gapExtend = makeBlastScoringMatrix(hmm, ("ACTG",))
            logger.info("Gap open: %s, Gap extend: %s, Match probs %s" %
                        (gapOpen, gapExtend, " ".join(map(str, matchProbs))))

            # The XML summary and the model file must report the same
            # likelihood (exact comparison, as in the original check).
            self.assertEqual(float(node.attrib["maxLikelihood"]), hmm.likelihood)

            # Now use the blast file to compute a new set of alignments.
            computeAlignments(seqFile1, seqFile2, alignmentsFile,
                              lastzArguments=("--ambiguous=iupac --scores=%s" % outputBlastFile))

            # Run modifyHmm (input and output are the same file) to check it works.
            system("cPecanModifyHmm %s %s --gcContent=0.5 --substitutionRate=0.05 --setFlatIndelEmissions" %
                   (outputModelFile, outputModelFile))
            # Reload both files; the results are not inspected, so this
            # only exercises the loaders on the post-modification outputs.
            Hmm.loadHmm(outputModelFile)
            ET.parse(outputModelXMLFile).getroot()
        finally:
            # Always clean up, so a failing pair does not leave a temp
            # directory behind in the working directory.
            system("rm -rf %s" % tempDir)
def testCPecanEm(self):
    """Runs cPecanEm and checks that further EM iterations raise the likelihood.

    For each model type ("fiveState", "fiveStateAsymmetric", "threeState",
    "threeStateAsymmetric") and each pair of sequence files yielded by
    seqFilePairGenerator():

    * a model is trained for one EM iteration from a non-random start and
      its likelihood is recorded;
    * training is continued from that model file for 5 more iterations;
    * the test asserts that the likelihood strictly increased.

    The temporary directory is removed even when a step fails.
    """
    realignOptions = "--diagonalExpansion=6 --splitMatrixBiggerThanThis=100"
    iterations = 5
    trial = 0
    for modelType in ("fiveState", "fiveStateAsymmetric",
                      "threeState", "threeStateAsymmetric"):
        for seqFile1, seqFile2 in seqFilePairGenerator():
            tempDir = getTempDirectory(rootDir=os.getcwd())
            try:
                jobTreeDir = os.path.join(tempDir, "jobTree")
                alignmentsFile = os.path.join(tempDir, "alignments.cigars")
                outputModelFile = os.path.join(tempDir, "outputModel.txt")

                computeAlignments(seqFile1, seqFile2, alignmentsFile)
                logger.info("Computed alignments for seqs %s and %s" % (seqFile1, seqFile2))

                # First run the script to generate a model and do one
                # iteration of EM, to get the likelihood to compare with
                # the final likelihood.
                runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                            alignmentsFile=alignmentsFile,
                            outputModelFile=outputModelFile,
                            modelType=modelType,
                            jobTreeDir=jobTreeDir,
                            iterations=1,
                            trials=1,
                            randomStart=False,
                            logLevel=getLogLevelString(),
                            setJukesCantorStartingEmissions=0.2,
                            trainEmissions=True,
                            tieEmissions=True,
                            optionsToRealign=realignOptions)
                hmm = Hmm.loadHmm(outputModelFile)
                system("rm -rf %s" % jobTreeDir)  # Cleanup the old jobTree
                logger.info("For trial %s the likelihood after 1 iteration of EM is %s" %
                            (trial, hmm.likelihood))

                # Continue training from the model just written; the same
                # file is used as input and output.
                runCPecanEm(sequenceFiles=[seqFile1, seqFile2],
                            alignmentsFile=alignmentsFile,
                            outputModelFile=outputModelFile,
                            jobTreeDir=jobTreeDir,
                            optionsToRealign=realignOptions,
                            iterations=iterations,
                            inputModelFile=outputModelFile,
                            logLevel=getLogLevelString(),
                            maxAlignmentLengthPerJob=10000)
                hmm2 = Hmm.loadHmm(outputModelFile)
                logger.info("For trial %s the likelihood after a further %s iterations of EM is %s" %
                            (trial, iterations, hmm2.likelihood))

                self.assertLess(hmm.likelihood, hmm2.likelihood)

                hmm2.normalise()
                logger.info("Final transitions: %s" % " ".join(map(str, hmm2.transitions)))
                logger.info("Final emissions: %s" % " ".join(map(str, hmm2.emissions)))
            finally:
                # Always clean up, so a failing trial does not leave a temp
                # directory behind in the working directory.
                system("rm -rf %s" % tempDir)
            trial += 1