def run(self):
    """Create the cactus alignment for the assembly if needed, then queue stats.

    If ``<outputDir>/cactusAlignment`` is missing, the assembly file is copied
    into the local temp dir (and gunzipped when its name ends in ``.gz``), an
    experiment XML is written, the cactus workflow is run on a single machine,
    jobTree stats are written and the contents of the local temp dir are moved
    into ``self.outputDir``.  A ``MakeStats1`` child target is then added.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(cactusAlignment):
        # Prepare the assembly: first copy it so the input is left untouched.
        if self.assemblyFile.endswith('.gz'):
            tempAssemblyFile = getTempFile(rootDir=self.getLocalTempDir(),
                                           suffix=".gz")
            system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))
            system("gunzip %s" % tempAssemblyFile)
            # Drop the ".gz" suffix to get the name of the decompressed file.
            tempAssemblyFile = tempAssemblyFile[:-3]
            assert os.path.exists(tempAssemblyFile)
        else:
            tempAssemblyFile = getTempFile(rootDir=self.getLocalTempDir(),
                                           suffix="")
            system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))

        # Make the supporting temporary files
        tempExperimentFile = getTempFile(rootDir=self.getLocalTempDir())
        tempJobTreeDir = os.path.join(self.getLocalTempDir(), "jobTree")

        # Make the experiment file
        cactusWorkflowExperiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [tempAssemblyFile],
            newickTreeString=self.newickTree,
            outputDir=self.getLocalTempDir(),
            configFile=self.configFile)
        cactusWorkflowExperiment.setDbName(cactusAlignmentName)
        # This needs to be set to ensure the thing gets put in the right
        # directory
        cactusWorkflowExperiment.setDbDir(
            os.path.join(self.getLocalTempDir(),
                         cactusWorkflowExperiment.getDbName()))
        cactusWorkflowExperiment.writeXML(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check if the jobtree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the stats
        tempJobTreeStatsFile = os.path.join(self.getLocalTempDir(),
                                            "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s"
               % (tempJobTreeDir, tempJobTreeStatsFile))

        # Now move everything produced in the local temp dir to the output
        system("mv %s/* %s" % (self.getLocalTempDir(), self.outputDir))
        assert os.path.exists(cactusAlignment)
        # We're done!
    # NOTE(review): the stats target is assumed to be queued whether or not
    # the alignment already existed -- confirm against the original layout.
    self.addChildTarget(
        MakeStats1(self.outputDir, cactusAlignment, self.options))
def testCactusRealign(self):
    """Check that cactus realign keeps the coordinates of every lastz cigar.

    For each sequence pair, lastz is run, its alignments are realigned with
    the default realign arguments, and each realigned cigar must report the
    same coordinates (``sameCoordinates``) as the lastz cigar on the
    corresponding line.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        lastzOutput = getTempFile(rootDir=self.tempDir)
        runLastz(seqFile1, seqFile2,
                 alignmentsFile=lastzOutput,
                 lastzArguments=self.defaultLastzArguments,
                 work_dir=self.tempDir)
        realignOutput = getTempFile(rootDir=self.tempDir)
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=lastzOutput,
                         outputAlignmentsFile=realignOutput,
                         realignArguments=self.defaultRealignArguments,
                         work_dir=self.tempDir)

        # Read both files up front so the handles are closed promptly.
        with open(realignOutput, 'r') as realignHandle:
            realignLines = realignHandle.readlines()
        with open(lastzOutput, 'r') as lastzHandle:
            lastzLines = lastzHandle.readlines()

        # Each name is paired with the file it describes (the previous
        # version zipped the two files in the opposite order).
        for realignLine, lastzLine in zip(realignLines, lastzLines):
            realignCigar = cigarReadFromString(realignLine)
            lastzCigar = cigarReadFromString(lastzLine)
            self.assertTrue(realignCigar.sameCoordinates(lastzCigar))
def testCactusRealignRescoreByIdentityAndProb(self):
    """Check that every rescoring mode of cactus realign yields scores in [0, 100].

    For each sequence pair the lastz alignments are realigned three times,
    with ``--rescoreByIdentity``, ``--rescoreByPosteriorProb`` and
    ``--rescoreByIdentityIgnoringGaps``; the score of every resulting cigar
    must lie between 0 and 100 inclusive.
    """
    rescoreFlags = (" --rescoreByIdentity",
                    " --rescoreByPosteriorProb",
                    " --rescoreByIdentityIgnoringGaps")

    def readLines(path):
        # Close the handle as soon as the lines are in memory.
        with open(path, 'r') as handle:
            return handle.readlines()

    for seqFile1, seqFile2 in seqFilePairGenerator():
        lastzOutput = getTempFile(rootDir=self.tempDir)
        runLastz(seqFile1, seqFile2,
                 alignmentsFile=lastzOutput,
                 lastzArguments=self.defaultLastzArguments,
                 work_dir=self.tempDir)

        # One realignment per rescoring flag, in the order listed above.
        realignOutputs = []
        for flag in rescoreFlags:
            realignOutput = getTempFile(rootDir=self.tempDir)
            runCactusRealign(seqFile1, seqFile2,
                             inputAlignmentsFile=lastzOutput,
                             outputAlignmentsFile=realignOutput,
                             realignArguments=self.defaultRealignArguments + flag,
                             work_dir=self.tempDir)
            realignOutputs.append(realignOutput)

        # The lastz lines stay in the zip so that, as before, iteration stops
        # at the shortest of the four files.
        lineGroups = zip(readLines(realignOutputs[0]),
                         readLines(realignOutputs[1]),
                         readLines(realignOutputs[2]),
                         readLines(lastzOutput))
        for byIdentity, byPosteriorProb, byIdentityIgnoringGaps, _lastzLine in lineGroups:
            # Check scores are as expected
            for realignLine in (byIdentity, byPosteriorProb, byIdentityIgnoringGaps):
                realignCigar = cigarReadFromString(realignLine)
                self.assertGreaterEqual(realignCigar.score, 0)
                self.assertLessEqual(realignCigar.score, 100.0)
def down(target, inputFile, fileStart, fileEnd, N, outputFile):
    """Sort the byte range [fileStart, fileEnd) of inputFile into outputFile.

    Ranges longer than N are split at a midpoint into two child ``down``
    targets writing to temporary files, with an ``up`` follow-on that
    combines the two temporary files into outputFile.  Shorter ranges are
    copied to outputFile and sorted directly.
    """
    # Deliberate, randomly injected failure: this error is a test error, it
    # does not mean the tests have failed.
    if random.random() > 0.5:
        raise RuntimeError()

    rangeLength = fileEnd - fileStart
    target.logToMaster(
        "Am running a down target with length: %i from input file: %s"
        % (rangeLength, inputFile))
    assert rangeLength >= 0

    if not rangeLength > N:
        # Small enough: we can sort this bit of the file here.
        copySubRangeOfFile(inputFile, fileStart, fileEnd, outputFile)
        sort(outputFile)
        return

    # We will subdivide the file
    splitPoint = getMidPoint(inputFile, fileStart, fileEnd)
    assert splitPoint >= fileStart
    assert splitPoint + 1 < fileEnd
    # Add one to the split point to avoid the newline
    boundary = splitPoint + 1
    lowerHalfFile = getTempFile(rootDir=target.getGlobalTempDir())
    upperHalfFile = getTempFile(rootDir=target.getGlobalTempDir())
    target.addChildTargetFn(
        down, (inputFile, fileStart, boundary, N, lowerHalfFile))
    target.addChildTargetFn(
        down, (inputFile, boundary, fileEnd, N, upperHalfFile))
    target.setFollowOnTargetFn(up, (lowerHalfFile, upperHalfFile, outputFile))
def run(self):
    """Start the ingroup-vs-outgroup blast.

    Ensures the outgroup fragments directory (and the ingroup coverage
    directory, when one is configured) exists, adds a child target that
    blasts the ingroups all-against-all, and sets a ``BlastFirstOutgroup``
    follow-on that receives the ingroup results file, an outgroup results
    file and ``self.finalResultsFile``.

    Raises:
        OSError: if a required directory cannot be created and does not
            already exist.
    """
    self.logToMaster("Blasting ingroups vs outgroups to file %s"
                     % (self.finalResultsFile))

    requiredDirs = [self.outgroupFragmentsDir]
    if self.ingroupCoverageDir is not None:
        requiredDirs.append(self.ingroupCoverageDir)
    for directory in requiredDirs:
        try:
            os.makedirs(directory)
        except OSError:
            # Only "directory already exists" is benign; anything else
            # (permissions, a file in the way, ...) must not be hidden.
            if not os.path.isdir(directory):
                raise

    ingroupResultsFile = getTempFile("ingroupResults",
                                     rootDir=self.getGlobalTempDir())
    self.addChildTarget(
        BlastSequencesAllAgainstAll(self.ingroupSequenceFiles,
                                    ingroupResultsFile,
                                    self.blastOptions))
    outgroupResultsFile = getTempFile("outgroupResults",
                                      rootDir=self.getGlobalTempDir())
    self.setFollowOnTarget(
        BlastFirstOutgroup(self.ingroupSequenceFiles,
                           self.ingroupSequenceFiles,
                           self.outgroupSequenceFiles,
                           self.outgroupFragmentsDir,
                           ingroupResultsFile,
                           outgroupResultsFile,
                           self.finalResultsFile,
                           self.blastOptions,
                           1,
                           self.ingroupCoverageDir))
def testFastaReadWriteC(self):
    """Round-trip random fasta records through the C tool and compare.

    Each iteration writes up to nine random sequences with ``fastaWrite``,
    runs ``sonLib_fastaCTest`` to copy them to a second file, and checks
    that the records read back match the ones written, in order.
    """
    inputPath = getTempFile()
    self.tempFiles.append(inputPath)
    outputPath = getTempFile()
    self.tempFiles.append(outputPath)

    for test in range(0, self.testNo):
        fastaNumber = random.choice(range(10))
        sequences = [getRandomSequence() for i in range(fastaNumber)]

        with open(inputPath, 'w') as writeHandle:
            for name, seq in sequences:
                fastaWrite(writeHandle, name, seq)

        command = "sonLib_fastaCTest %s %s" % (inputPath, outputPath)
        print(command)
        system(command)

        # Expected records are popped off the end, so reverse them first.
        sequences.reverse()
        with open(outputPath, 'r') as readHandle:
            outFh = io.StringIO()
            for record in fastaRead(readHandle):
                name, seq = record
                assert record == sequences.pop()
                fastaWrite(outFh, name, seq)
            outFh.close()
def testCPecanRealignSplitSequences(self):
    """Runs cPecanRealign, splitting indels longer than 100bp, and checks
    that the coverage from the results is the same as the coverage from
    realigning with no extra arguments.

    The two output files are removed even when a command or assertion
    fails.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        # Drop the lastz command since it's not needed. But it is still
        # convenient to use the same parameters as all the other tests.
        realignCommand, _ = getCommands(seqFile1, seqFile2)
        splitRealignCommand = realignCommand + " --splitIndelsLongerThanThis 100"
        realignOutput = getTempFile()
        splitRealignOutput = getTempFile()
        try:
            realignCommand += " > %s" % realignOutput
            splitRealignCommand += " > %s" % splitRealignOutput
            system(realignCommand)
            system(splitRealignCommand)

            # Check coverage on seqFile1, then on seqFile2.
            # The following will fail until we refactor.
            for seqFile in (seqFile1, seqFile2):
                splitRealignCoverage = popenCatch(
                    "cactus_coverage %s %s" % (seqFile, splitRealignOutput))
                realignCoverage = popenCatch(
                    "cactus_coverage %s %s" % (seqFile, realignOutput))
                self.assertEqual(splitRealignCoverage, realignCoverage)
        finally:
            # Clean up on failure too, so temp files do not accumulate.
            for path in (realignOutput, splitRealignOutput):
                if os.path.exists(path):
                    os.remove(path)
def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised.

    Runs ``cactus_blast.py`` in ingroup-vs-outgroups mode asking it to keep
    the coverage bed files, then recomputes each ingroup's coverage with
    ``cactus_coverage`` and requires ``filecmp.cmp`` to report it equal to
    the kept file.
    """
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    # Real lists (not map objects): both are iterated more than once below.
    ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                    for x in ingroups]
    outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                     for x in outgroups]

    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/outgroupJobTree --outgroupFragmentsDir %s/outgroupFragments --ingroupCoverageDir %s/ingroupCoverages" % (",".join(ingroupPaths), ",".join(outgroupPaths), self.tempOutputFile, self.tempDir, self.tempDir, self.tempDir))

    for ingroupPath in ingroupPaths:
        # Get the coverage from the outgroups independently and
        # check that it's the same as the file in ingroupCoverageDir.

        # To filter out alignments from the other ingroup and
        # self-alignments we need to create a fasta with all the
        # outgroup fragments in it.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupPath in outgroupPaths:
            system("cat %s/outgroupFragments/%s >> %s" % (self.tempDir, os.path.basename(outgroupPath), outgroupsCombined))
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        system("cactus_coverage --from %s %s %s > %s" % (outgroupsCombined, ingroupPath, self.tempOutputFile, independentCoverageFile))

        # find the coverage file cactus_blast kept (should be
        # named according to the basename of the ingroup path file)
        keptCoverageFile = os.path.join("%s/ingroupCoverages" % self.tempDir,
                                        os.path.basename(ingroupPath) + ".bed")
        print(independentCoverageFile)
        self.assertTrue(os.path.isfile(keptCoverageFile))
        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
def testCactusRealignDummy(self):
    """Runs cactus realign using the "rescoreOriginalAlignment" mode
    and checks the output is equivalent to what you'd get by just running
    lastz.

    Each cigar parsed from the realign output must be non-None and compare
    equal to the cigar parsed from the corresponding lastz line.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        lastzOutput = getTempFile(rootDir=self.tempDir)
        runLastz(seqFile1, seqFile2,
                 alignmentsFile=lastzOutput,
                 lastzArguments=self.defaultLastzArguments)
        realignOutput = getTempFile(rootDir=self.tempDir)
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=lastzOutput,
                         outputAlignmentsFile=realignOutput,
                         realignArguments=self.defaultRealignArguments + " --rescoreOriginalAlignment")

        # Read both files up front so the handles are closed promptly.
        with open(realignOutput, 'r') as realignHandle:
            realignLines = realignHandle.readlines()
        with open(lastzOutput, 'r') as lastzHandle:
            lastzLines = lastzHandle.readlines()

        # Each name is paired with the file it describes (the previous
        # version zipped the two files in the opposite order).
        for realignLine, lastzLine in zip(realignLines, lastzLines):
            realignCigar = cigarReadFromString(realignLine)
            lastzCigar = cigarReadFromString(lastzLine)
            # Identity check: must not go through a custom __ne__.
            self.assertIsNotNone(realignCigar)
            self.assertTrue(realignCigar == lastzCigar)
def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised.

    Runs the ingroup-vs-outgroups blast asking it to keep per-ingroup
    coverage files, then recomputes each ingroup's coverage with
    ``calculateCoverage`` and requires ``filecmp.cmp`` to report it equal
    to the kept file.
    """
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    # Real lists (not map objects): ingroupPaths is passed on and then
    # iterated again below.
    ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                    for x in ingroups]
    outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                     for x in outgroups]

    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    toilDir = os.path.join(self.tempDir, "tmp_toil")
    outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir)
                             for outgroup in outgroups]
    ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir)
                            for ingroup in ingroups]
    runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                       outgroups=outgroupPaths,
                                       alignmentsFile=self.tempOutputFile,
                                       outgroupFragmentPaths=outgroupFragmentPaths,
                                       ingroupCoveragePaths=ingroupCoveragePaths,
                                       toilDir=toilDir)

    # ingroupCoveragePaths has one entry per ingroup, in the same order.
    for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
        # Get the coverage from the outgroups independently and
        # check that it's the same as the kept coverage file.

        # To filter out alignments from the other ingroup and
        # self-alignments we need to create a fasta with all the
        # outgroup fragments in it.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupFragmentPath in outgroupFragmentPaths:
            system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        calculateCoverage(fromGenome=outgroupsCombined,
                          sequenceFile=ingroupPath,
                          cigarFile=self.tempOutputFile,
                          outputFile=independentCoverageFile)

        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
def down(target, inputFile, fileStart, fileEnd, N, outputFile):
    """Recursively sort a range of inputFile, writing the result to outputFile.

    When the range [fileStart, fileEnd) is longer than the threshold N it is
    cut in two at a midpoint; each part is handed to a child ``down`` target
    and an ``up`` follow-on is registered with the two partial files and
    outputFile.  Otherwise the range is copied to outputFile and sorted in
    place.
    """
    injectFailure = random.random() > 0.5
    if injectFailure:
        # This error is a test error, it does not mean the tests have failed.
        raise RuntimeError()

    span = fileEnd - fileStart
    message = "Am running a down target with length: %i from input file: %s" % (span, inputFile)
    target.logToMaster(message)
    assert span >= 0

    if span > N:
        # We will subdivide the file
        pivot = getMidPoint(inputFile, fileStart, fileEnd)
        assert pivot >= fileStart
        assert pivot + 1 < fileEnd
        firstPart = getTempFile(rootDir=target.getGlobalTempDir())
        secondPart = getTempFile(rootDir=target.getGlobalTempDir())
        # The halves meet at pivot + 1: add one to avoid the newline.
        childArguments = [
            (inputFile, fileStart, pivot + 1, N, firstPart),
            (inputFile, pivot + 1, fileEnd, N, secondPart),
        ]
        for arguments in childArguments:
            target.addChildTargetFn(down, arguments)
        target.setFollowOnTargetFn(up, (firstPart, secondPart, outputFile))
    else:
        # We can sort this bit of the file
        copySubRangeOfFile(inputFile, fileStart, fileEnd, outputFile)
        sort(outputFile)
def testKeepingCoverageOnIngroups(self):
    """Tests whether the --ingroupCoverageDir option works as advertised.

    Runs the ingroup-vs-outgroups blast asking it to keep per-ingroup
    coverage files, then recomputes each ingroup's coverage with
    ``calculateCoverage`` (in a fresh work directory) and requires
    ``filecmp.cmp`` to report it equal to the kept file.
    """
    encodeRegion = "ENm001"
    ingroups = ["human", "cow"]
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    # Real lists (not map objects): ingroupPaths is passed on and then
    # iterated again below.
    ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                    for x in ingroups]
    outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                     for x in outgroups]

    # Run blast in "ingroup vs outgroups" mode, requesting to keep
    # the bed files that show outgroup coverage on the ingroup.
    toilDir = os.path.join(self.tempDir, "tmp_toil")
    outgroupFragmentPaths = [getTempFile(rootDir=self.tempDir)
                             for outgroup in outgroups]
    ingroupCoveragePaths = [getTempFile(rootDir=self.tempDir)
                            for ingroup in ingroups]
    runCactusBlastIngroupsAndOutgroups(ingroups=ingroupPaths,
                                       outgroups=outgroupPaths,
                                       alignmentsFile=self.tempOutputFile,
                                       outgroupFragmentPaths=outgroupFragmentPaths,
                                       ingroupCoveragePaths=ingroupCoveragePaths,
                                       toilDir=toilDir)

    # ingroupCoveragePaths has one entry per ingroup, in the same order.
    for ingroupPath, keptCoverageFile in zip(ingroupPaths, ingroupCoveragePaths):
        # Get the coverage from the outgroups independently and
        # check that it's the same as the kept coverage file.

        # To filter out alignments from the other ingroup and
        # self-alignments we need to create a fasta with all the
        # outgroup fragments in it.
        outgroupsCombined = getTempFile(rootDir=self.tempDir)
        for outgroupFragmentPath in outgroupFragmentPaths:
            system("cat %s >> %s" % (outgroupFragmentPath, outgroupsCombined))
        independentCoverageFile = getTempFile(rootDir=self.tempDir)
        coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
        calculateCoverage(work_dir=coverageWorkDir,
                          fromGenome=outgroupsCombined,
                          sequenceFile=ingroupPath,
                          cigarFile=self.tempOutputFile,
                          outputFile=independentCoverageFile)

        self.assertTrue(filecmp.cmp(independentCoverageFile, keptCoverageFile))
def testCactusRealignSplitSequences(self):
    """Compare coverage of a plain realign with an indel-splitting realign.

    For each sequence pair the lastz alignments are realigned twice, once
    with the default arguments and once with
    ``--splitIndelsLongerThanThis 100``; the coverage reported for both
    sequence files must be equal between the two runs.
    """
    splitArguments = self.defaultRealignArguments + " --splitIndelsLongerThanThis 100"
    for seqFile1, seqFile2 in seqFilePairGenerator():
        lastzOutput = getTempFile(rootDir=self.tempDir)
        runLastz(seqFile1, seqFile2,
                 alignmentsFile=lastzOutput,
                 lastzArguments=self.defaultLastzArguments)

        # Plain realignment first, then the indel-splitting one.
        realignOutput = getTempFile(rootDir=self.tempDir)
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=lastzOutput,
                         outputAlignmentsFile=realignOutput,
                         realignArguments=self.defaultRealignArguments)
        splitRealignOutput = getTempFile(rootDir=self.tempDir)
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=lastzOutput,
                         outputAlignmentsFile=splitRealignOutput,
                         realignArguments=splitArguments)

        # Check coverage on seqFile1, then on seqFile2
        for sequenceFile in (seqFile1, seqFile2):
            coverageWithSplit = runCactusCoverage(sequenceFile, splitRealignOutput)
            coverageWithoutSplit = runCactusCoverage(sequenceFile, realignOutput)
            self.assertTrue(coverageWithSplit == coverageWithoutSplit)

        for finishedOutput in (realignOutput, splitRealignOutput):
            os.remove(finishedOutput)
def testMatchGraph(self):
    """Tests matchGraph.py program using randGraph.py input.

    The output file is expected to hold a header line
    ``<vertexNum> <edgeNum>`` followed by one ``<i> <j>`` line per edge.
    The test checks that the vertex count is even, that there are exactly
    vertexNum / 2 edges, that every index is within [0, vertexNum) and that
    every vertex appears in exactly one edge.
    """
    for test in range(self.testNo):
        tempInputFile = getTempFile()
        tempOutputFile = getTempFile()
        self.tempFiles.append(tempInputFile)
        self.tempFiles.append(tempOutputFile)

        # Create sample/test input graph file
        system("blossom_randGraph.py > %s" % tempInputFile)

        # Run matchGraph.py
        system("matchGraph.py -e %s -w %s" % (tempInputFile, tempOutputFile))

        # Now check if output is valid; read it all so the handle is closed.
        with open(tempOutputFile, 'r') as outputHandle:
            lines = [line.rstrip() for line in outputHandle]
        self.assertTrue(lines, "matchGraph.py produced no output")

        vertexNum, edgeNum = [int(field) for field in lines[0].split()]
        # Number of vertices must be even
        self.assertEqual(vertexNum % 2, 0)
        # Number of edges is half the number of vertices
        self.assertEqual(vertexNum // 2, edgeNum)

        vertexArray = [0] * vertexNum
        for line in lines[1:]:
            vertexI, vertexJ = [int(field) for field in line.split()]
            # Vertex indices must be 0 <= i,j < V.  Checked before indexing
            # so that a negative index cannot silently wrap around.
            self.assertTrue(0 <= vertexI < vertexNum)
            self.assertTrue(0 <= vertexJ < vertexNum)
            vertexArray[vertexI] += 1
            vertexArray[vertexJ] += 1

        # Must have the correct number of edges
        self.assertEqual(edgeNum, len(lines) - 1)

        # Each vertex must be only in one edge
        badCount = sum(1 for count in vertexArray if count != 1)
        self.assertEqual(badCount, 0)

        logger.info("Ran the test(s) of the matchGraph program okay")
def testBlossom(self):
    """Tests blossom5 program using randGraph.py input.

    The output file is expected to hold a header line
    ``<vertexNum> <edgeNum>`` followed by one ``<i> <j>`` line per edge.
    The test checks that the vertex count is even, that there are exactly
    vertexNum / 2 edges, that every index is within [0, vertexNum) and that
    every vertex appears in exactly one edge.
    """
    for test in range(self.testNo):
        tempInputFile = getTempFile()
        tempOutputFile = getTempFile()
        self.tempFiles.append(tempInputFile)
        self.tempFiles.append(tempOutputFile)

        # Create sample/test input graph file
        system("blossom_randGraph.py > %s" % tempInputFile)

        # Run blossom5, discarding its stdout and stderr.  The POSIX
        # spelling is used because ">&" is not understood by every /bin/sh.
        system("blossom5 -e %s -w %s > /dev/null 2>&1"
               % (tempInputFile, tempOutputFile))

        # Now check if output is valid; read it all so the handle is closed.
        with open(tempOutputFile, 'r') as outputHandle:
            lines = [line.rstrip() for line in outputHandle]
        self.assertTrue(lines, "blossom5 produced no output")

        vertexNum, edgeNum = [int(field) for field in lines[0].split()]
        # Number of vertices must be even
        self.assertEqual(vertexNum % 2, 0)
        # Number of edges is half the number of vertices
        self.assertEqual(vertexNum // 2, edgeNum)

        vertexArray = [0] * vertexNum
        for line in lines[1:]:
            vertexI, vertexJ = [int(field) for field in line.split()]
            # Vertex indices must be 0 <= i,j < V.  Checked before indexing
            # so that a negative index cannot silently wrap around.
            self.assertTrue(0 <= vertexI < vertexNum)
            self.assertTrue(0 <= vertexJ < vertexNum)
            vertexArray[vertexI] += 1
            vertexArray[vertexJ] += 1

        # Must have the correct number of edges
        self.assertEqual(edgeNum, len(lines) - 1)

        # Each vertex must be only in one edge
        badCount = sum(1 for count in vertexArray if count != 1)
        self.assertEqual(badCount, 0)

        logger.info("Ran the test(s) of the blossom program okay")
def testAddingOutgroupsImprovesResult(self):
    """Blast ingroups against a growing list of outgroups and compare results.

    For each encode region, the ingroups are aligned against the first 1, 2,
    3 and 4 of four randomly chosen outgroups (original order kept).  Every
    later result set is compared with the single-outgroup result set and
    must reach a sensitivity of at least 0.99.  Coverage and "re-covered"
    human positions are printed as diagnostics only.
    """
    def humanPositions(alignments):
        # (name, position) of the human side of every alignment touching human.
        return set((a[0], a[1]) if "human" in a[0] else (a[2], a[3])
                   for a in alignments
                   if "human" in a[0] or "human" in a[2])

    encodeRegions = ["ENm00" + str(i) for i in xrange(1, 2)]
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    # subselect 4 random ordered outgroups
    chosenIndices = sorted(random.sample(xrange(len(outgroups)), 4))
    outgroups = [outgroups[i] for i in chosenIndices]

    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                        for x in ingroups]
        outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                         for x in outgroups]

        results = []
        for numOutgroups in xrange(1, 5):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
            tmpToil = os.path.join(self.tempDir, "outgroupToil")
            runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                               alignmentsFile=subResults,
                                               toilDir=tmpToil)
            results.append(subResults)

        # Print diagnostics about coverage
        for i, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                ingroupCoverage = getTempFile(rootDir=self.tempDir)
                coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                calculateCoverage(work_dir=coverageWorkDir,
                                  sequenceFile=ingroupPath,
                                  cigarFile=subResults,
                                  outputFile=ingroupCoverage)
                coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

        resultsSets = [loadResults(x) for x in results]
        baseline = resultsSets[0]
        for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Make sure the results with additional outgroups are
            # (very nearly) a superset of the single-outgroup results
            print("Using %d addl outgroup(s):" % (i + 1))
            comparator = ResultComparator(baseline, moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Report how much of the already existing alignments to human the
        # new alignments cover again
        for i, (prevSet, curSet) in enumerate(zip(resultsSets, resultsSets[1:]), 1):
            prevResults = prevSet[0]
            curResults = curSet[0]
            prevResultsHumanPos = humanPositions(prevResults)
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = humanPositions(newAlignments)
            print("addl outgroup %d:" % i)
            print("bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))))

        for subResult in results:
            os.remove(subResult)
def testAddingOutgroupsImprovesResult(self):
    """Blast ingroups against a growing list of outgroups and compare results.

    The ingroups of region ENm001 are aligned against the first 1..k of
    MAX_NUM_OUTGROUPS randomly chosen outgroups (original order kept).
    Every later result set is compared with the single-outgroup result set
    and must reach a sensitivity of at least 0.99.  Coverage and
    "re-covered" human positions are printed as diagnostics only.
    """
    def humanPositions(alignments):
        # (name, position) of the human side of every alignment touching human.
        return set((a[0], a[1]) if "human" in a[0] else (a[2], a[3])
                   for a in alignments
                   if "human" in a[0] or "human" in a[2])

    encodeRegion = "ENm001"
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    MAX_NUM_OUTGROUPS = 3
    # subselect a random set of outgroups in the same order
    chosenIndices = sorted(random.sample(xrange(len(outgroups)), MAX_NUM_OUTGROUPS))
    outgroups = [outgroups[i] for i in chosenIndices]

    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                    for x in ingroups]
    outgroupPaths = [os.path.join(regionPath, x + "." + encodeRegion + ".fa")
                     for x in outgroups]

    results = []
    for numOutgroups in xrange(1, len(outgroups) + 1):
        # Align w/ increasing numbers of outgroups
        subResults = getTempFile()
        subOutgroupPaths = outgroupPaths[:numOutgroups]
        print("aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths)))
        tmpToil = os.path.join(self.tempDir, "outgroupToil")
        runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                           alignmentsFile=subResults,
                                           toilDir=tmpToil)
        results.append(subResults)

    # Print diagnostics about coverage
    for i, subResults in enumerate(results):
        for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
            ingroupCoverage = getTempFile(rootDir=self.tempDir)
            calculateCoverage(sequenceFile=ingroupPath,
                              cigarFile=subResults,
                              outputFile=ingroupCoverage)
            coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
            print("covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases))

    resultsSets = [loadResults(x) for x in results]
    baseline = resultsSets[0]
    for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
        # Make sure the results with additional outgroups are
        # (very nearly) a superset of the single-outgroup results
        print("Using %d addl outgroup(s):" % (i + 1))
        comparator = ResultComparator(baseline, moreOutgroupsResults)
        print(comparator)
        self.assertTrue(comparator.sensitivity >= 0.99)

    # Report how much of the already existing alignments to human the
    # new alignments cover again
    for i, (prevSet, curSet) in enumerate(zip(resultsSets, resultsSets[1:]), 1):
        prevResults = prevSet[0]
        curResults = curSet[0]
        prevResultsHumanPos = humanPositions(prevResults)
        newAlignments = curResults.difference(prevResults)
        newAlignmentsHumanPos = humanPositions(newAlignments)
        print("addl outgroup %d:" % i)
        print("bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))))

    for subResult in results:
        os.remove(subResult)
def liftover(self, bedLine):
    """Lift a bedLine over to the target genome and parse the PSL output.

    Writes bedLine to a temporary bed file, runs ``halLiftover --outPSL``
    from ``self.srcGenome`` to ``self.destGenome`` and collects, per target
    sequence name, a list of (query block, target block) pairs.  Blocks are
    (start, end, strand) tuples where start < end; target blocks on the
    '-' strand are converted using the target size (PSL column 15).

    Returns whatever ``self.mergeBlocks`` produces from that mapping
    (described by the original author as a map from target sequence ->
    [(query block, [target block(s)])]).

    Both temporary files are removed even if halLiftover fails.
    """
    tempSrc = getTempFile("ContiguousRegions.tempSrc.bed",
                          rootDir=self.tempRoot)
    tempDest = getTempFile("ContiguousRegions.tempDest.psl",
                           rootDir=self.tempRoot)
    try:
        # Close (and thereby flush) the bed file before the external
        # program reads it.
        with open(tempSrc, 'w') as srcHandle:
            srcHandle.write("%s\n" % bedLine)
        cmd = "halLiftover --outPSL %s %s %s %s %s" % (self.alignment,
                                                       self.srcGenome,
                                                       tempSrc,
                                                       self.destGenome,
                                                       tempDest)
        bioio.system(cmd)
        with open(tempDest) as destHandle:
            pslLines = [x.split() for x in destHandle.read().split("\n")]
    finally:
        for path in (tempSrc, tempDest):
            if os.path.exists(path):
                os.remove(path)

    # Get target blocks for every query block. All adjacencies
    # within a block are by definition preserved. Adjacencies
    # between target blocks (and query blocks with the commandline
    # option) are what determine if the structure is preserved.
    # dict is to keep blocks separated by target sequence & strand
    blocks = defaultdict(list)
    for pslLine in pslLines:
        if not pslLine:
            # Blank line (e.g. after the trailing newline).
            continue
        strandField = pslLine[8]
        qStrand = strandField[0]
        assert(qStrand == '+')
        if len(strandField) != 1:
            assert(len(strandField) == 2)
            tStrand = strandField[1]
        else:
            # A single strand character means the target strand is '+'.
            tStrand = '+'
        tName = pslLine[13]
        tSize = int(pslLine[14])
        # The comma-separated lists may end with a trailing comma, hence
        # the empty-string filter.
        blockSizes = [int(i) for i in pslLine[18].split(",") if i != '']
        qStarts = [int(i) for i in pslLine[19].split(",") if i != '']
        tStarts = [int(i) for i in pslLine[20].split(",") if i != '']
        assert(len(blockSizes) == len(qStarts) and
               len(qStarts) == len(tStarts))
        for blockLen, qStart, tStart in zip(blockSizes, qStarts, tStarts):
            qBlock = (qStart, qStart + blockLen, qStrand)
            if tStrand == '+':
                tBlock = (tStart, tStart + blockLen, tStrand)
            else:
                # Mirror the coordinates against the target size.
                tBlock = (tSize - tStart - blockLen, tSize - tStart, tStrand)
            blocks[tName].append((qBlock, tBlock))

    # Sort & merge query blocks in cases of duplication
    return self.mergeBlocks(blocks)
def liftover(self, bedLine):
    """Lift a bedLine over to the target genome and parse the PSL output.

    Writes bedLine to a temporary bed file, runs ``halLiftover --outPSL``
    from ``self.srcGenome`` to ``self.destGenome`` and collects, per target
    sequence name, a list of (query block, target block) pairs.  Blocks are
    (start, end, strand) tuples where start < end; target blocks on the
    '-' strand are converted using the target size (PSL column 15).

    Returns whatever ``self.mergeBlocks`` produces from that mapping
    (described by the original author as a map from target sequence ->
    [(query block, [target block(s)])]).

    Both temporary files are removed even if halLiftover fails.
    """
    tempSrc = getTempFile("ContiguousRegions.tempSrc.bed",
                          rootDir=self.tempRoot)
    tempDest = getTempFile("ContiguousRegions.tempDest.psl",
                           rootDir=self.tempRoot)
    try:
        # Close (and thereby flush) the bed file before the external
        # program reads it.
        with open(tempSrc, 'w') as srcHandle:
            srcHandle.write("%s\n" % bedLine)
        cmd = "halLiftover --outPSL %s %s %s %s %s" % (self.alignment,
                                                       self.srcGenome,
                                                       tempSrc,
                                                       self.destGenome,
                                                       tempDest)
        bioio.system(cmd)
        with open(tempDest) as destHandle:
            pslLines = [x.split() for x in destHandle.read().split("\n")]
    finally:
        for path in (tempSrc, tempDest):
            if os.path.exists(path):
                os.remove(path)

    # Get target blocks for every query block. All adjacencies
    # within a block are by definition preserved. Adjacencies
    # between target blocks (and query blocks with the commandline
    # option) are what determine if the structure is preserved.
    # dict is to keep blocks separated by target sequence & strand
    blocks = defaultdict(list)
    for pslLine in pslLines:
        if not pslLine:
            # Blank line (e.g. after the trailing newline).
            continue
        strandField = pslLine[8]
        qStrand = strandField[0]
        assert(qStrand == '+')
        if len(strandField) != 1:
            assert(len(strandField) == 2)
            tStrand = strandField[1]
        else:
            # A single strand character means the target strand is '+'.
            tStrand = '+'
        tName = pslLine[13]
        tSize = int(pslLine[14])
        # The comma-separated lists may end with a trailing comma, hence
        # the empty-string filter.
        blockSizes = [int(i) for i in pslLine[18].split(",") if i != '']
        qStarts = [int(i) for i in pslLine[19].split(",") if i != '']
        tStarts = [int(i) for i in pslLine[20].split(",") if i != '']
        assert(len(blockSizes) == len(qStarts) and
               len(qStarts) == len(tStarts))
        for blockLen, qStart, tStart in zip(blockSizes, qStarts, tStarts):
            qBlock = (qStart, qStart + blockLen, qStrand)
            if tStrand == '+':
                tBlock = (tStart, tStart + blockLen, tStrand)
            else:
                # Mirror the coordinates against the target size.
                tBlock = (tSize - tStart - blockLen, tSize - tStart, tStrand)
            blocks[tName].append((qBlock, tBlock))

    # Sort & merge query blocks in cases of duplication
    return self.mergeBlocks(blocks)
def testRepeatBed(self):
    """Run getRepeatBed on a small fasta and check the intervals it reports.

    The input holds two sequences ("hello boo" and "foo") in mixed case;
    the expected bed intervals are the runs of lower-case bases, named by
    the first word of each fasta header.  Both temporary files are removed
    even when the command or the assertion fails.
    """
    tempFile = getTempFile(rootDir=os.getcwd())
    tempFile2 = getTempFile(rootDir=os.getcwd())
    try:
        with open(tempFile, 'w') as fileHandle:
            fileHandle.write(">hello boo\nacTGACCCCgtcgAAcAAccc\n>foo\nAaaAAAAAAA")
        system("getRepeatBed %s %s" % (tempFile, tempFile2))

        # Each bed line is "<name> <start> <end>".
        intervals = []
        with open(tempFile2, 'r') as fileHandle:
            for line in fileHandle:
                name, start, end = line.split()
                intervals.append((name, int(start), int(end)))
        print(intervals)
        assert intervals == [("hello", 0, 2), ("hello", 9, 13),
                             ("hello", 15, 16), ("hello", 18, 21),
                             ("foo", 1, 3)]
    finally:
        for path in (tempFile, tempFile2):
            if os.path.exists(path):
                os.remove(path)
def testCopySubRangeOfFile(self):
    """Check copySubRangeOfFile against a plain string slice.

    For each of self.testNo rounds a file is generated with makeFileToSort,
    a random [fileStart, fileEnd) byte range is copied with
    copySubRangeOfFile, and the copy is compared with the same slice of the
    whole file's contents.
    """
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        outputFile = getTempFile(rootDir=tempDir)
        makeFileToSort(tempFile)
        fileSize = os.path.getsize(tempFile)
        assert fileSize > 0
        fileStart = random.choice(xrange(0, fileSize))
        fileEnd = random.choice(xrange(fileStart, fileSize))
        copySubRangeOfFile(tempFile, fileStart, fileEnd, outputFile)
        # Read both files through context managers so no handle is leaked.
        with open(outputFile, 'r') as copiedHandle:
            copied = copiedHandle.read()
        with open(tempFile, 'r') as sourceHandle:
            expected = sourceHandle.read()[fileStart:fileEnd]
        checkEqual(copied, expected)
        system("rm -rf %s" % tempDir)
def testInvariants(self):
    """Check that cactus_coverage output is sorted and non-overlapping.

    Two random (non-chimp) encode sequences are aligned with cPecanLastz,
    cactus_coverage is run on the resulting cigar file, and every BED line
    is checked: each interval must be at least one base long and, within a
    chromosome, must start after the previous start and not before the
    previous end.
    """
    (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
    # Chimp encode input has duplicate header names.
    seqs = [i for i in seqs if 'chimp' not in i]
    seqs = random.sample(seqs, 2)
    cigarPath = getTempFile()
    try:
        cactus_call(parameters=["cPecanLastz",
                                "--format=cigar",
                                "%s[multiple]" % seqs[0],
                                "%s[multiple]" % seqs[1]],
                    outfile=cigarPath)
        bed = cactus_call(parameters=["cactus_coverage", seqs[1], cigarPath],
                          check_output=True)
        prevChrom = None
        prevStart = None
        prevEnd = None
        # Check that everything is sorted and there are no overlaps
        for line in bed.split("\n"):
            line = line.strip()
            if line == "":
                continue
            fields = line.split()
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            self.assertTrue(end - start >= 1)
            if chrom == prevChrom:
                self.assertTrue(start > prevStart)
                self.assertTrue(start >= prevEnd)
            # Remember this interval; without this the ordering checks
            # above would never be reached.
            prevChrom, prevStart, prevEnd = chrom, start, end
    finally:
        os.remove(cigarPath)
def run(self):
    """Stage the input sequence and either copy it to the output or hand it
    to a BatchPreprocessor child target.

    A directory input is first concatenated into one temporary file. The
    assembly statistics are logged to the master before any preprocessing.
    """
    source = self.inputSequenceFileOrDirectory
    if os.path.isdir(source):
        # If the files are in a sub-dir then rip them out.
        inputSequenceFile = getTempFile(rootDir=self.getGlobalTempDir())
        memberPaths = [os.path.join(source, f) for f in os.listdir(source)]
        catFiles(memberPaths, inputSequenceFile)
    else:
        inputSequenceFile = source

    assert inputSequenceFile != self.outputSequenceFile

    prepXmlElems = self.configNode.findall("preprocessor")

    analysisString = runCactusAnalyseAssembly(inputSequenceFile)
    self.logToMaster(
        "Before running any preprocessing on the assembly: %s got following stats (assembly may be listed as temp file if input sequences from a directory): %s"
        % (source, analysisString))

    if prepXmlElems:
        logger.info("Adding child batch_preprocessor target")
        self.addChildTarget(BatchPreprocessor(prepXmlElems,
                                              inputSequenceFile,
                                              self.outputSequenceFile,
                                              0))
    else:
        # No preprocessor configured: just cp the file to the output file.
        system("cp %s %s" % (inputSequenceFile, self.outputSequenceFile))
def setup(target, inputFile, N):
    """Sets up the sort.

    Adds a `down` child covering the whole of inputFile (byte 0 up to its
    size) that writes to a fresh temporary file, and registers `cleanup`
    as the follow-on with that temporary file and inputFile.
    """
    sortedOutput = getTempFile(rootDir=target.getGlobalTempDir())
    wholeFileArgs = (inputFile, 0, os.path.getsize(inputFile), N, sortedOutput)
    target.addChildTargetFn(down, wholeFileArgs)
    target.setFollowOnFn(cleanup, (sortedOutput, inputFile))
def progressiveFunction(self, experimentFile, toilDir, batchSystem,
                        buildAvgs, buildHal, buildFasta, toilStats,
                        subtreeRoot=None, logLevel=None):
    """Write a seqFile for the experiment and run cactus progressive on it.

    The seqFile holds the experiment tree in Newick form on its first line
    followed by one "<genome> <sequence id>" line per genome that has a
    sequence.
    """
    eW = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqFile = getTempFile()
    with open(seqFile, 'w') as f:
        newick = NXNewick().writeString(eW.getTree())
        seqFileLines = ['%s\n' % newick]
        for genome in eW.getGenomesWithSequence():
            seqFileLines.append('%s %s\n' % (genome, eW.getSequenceID(genome)))
        f.writelines(seqFileLines)
    config = eW.getConfigPath()
    runCactusProgressive(seqFile,
                         config,
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats,
                         logLevel=logLevel)
def getFastaDict(self):
    """Dump self.genome from self.halPath with hal2fasta into a temporary
    file and return a dict mapping each FASTA header to its sequence.
    """
    fastaPath = getTempFile(rootDir=self.getGlobalTempDir())
    system("hal2fasta %s %s > %s" % (self.halPath, self.genome, fastaPath))
    return {header: seq for header, seq in fastaRead(fastaPath)}
def getRandomConfigFile():
    """Write a randomised copy of cactus_config.xml and return its path.

    The <caf> attributes are drawn at random, <check> gets runCheck="1" and
    <normal> gets iterations="2". The file is created in "./" with an .xml
    suffix and is echoed with cat when the log level is DEBUG.
    """
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    config = ET.parse(os.path.join(cactusRootPath(),
                                   "cactus_config.xml")).getroot()
    cafNode = config.find("caf")
    assert len(config.findall("caf")) == 1

    def scaledInt(upper):
        # One draw from the RNG, scaled to an int in [0, upper).
        return int(random.random() * upper)

    caf = cafNode.attrib
    # NOTE: the statements below keep the original order of RNG draws.
    annealingRounds = 1 + scaledInt(10)
    caf["annealingRounds"] = " ".join(
        [str(1 + scaledInt(10)) for _ in xrange(annealingRounds)])
    deannealingCount = scaledInt(10)
    deannealingRounds = sorted(
        set([1 + scaledInt(10) for _ in xrange(deannealingCount)]))
    caf["deannealingRounds"] = " ".join([str(r) for r in deannealingRounds])
    caf["trim"] = " ".join(
        [str(1 + scaledInt(5)) for _ in xrange(annealingRounds)])
    caf["alignRepeatsAtLoop"] = str(random.random() * annealingRounds)
    caf["minimumTreeCoverage"] = str(random.random())
    caf["blockTrim"] = str(scaledInt(5))
    caf["ignoreAllChainsLessThanMinimumTreeCoverage"] = str(
        random.choice([0, 1]))
    caf["minimumBlockDegree"] = str(random.choice([0, 5]))

    config.find("check").attrib["runCheck"] = "1"
    config.find("normal").attrib["iterations"] = "2"

    # Now print the file..
    with open(tempConfigFile, 'w') as fileHandle:
        ET.ElementTree(config).write(fileHandle)
    if getLogLevelString() == "DEBUG":
        system("cat %s" % tempConfigFile)
    return tempConfigFile
def testInvariants(self):
    """Check that cactus_coverage output is sorted and non-overlapping.

    Two random (non-chimp) encode sequences are aligned with cPecanLastz,
    cactus_coverage is run on the resulting cigar file, and every BED line
    is checked: each interval must be at least one base long and, within a
    chromosome, must start after the previous start and not before the
    previous end.
    """
    (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
    # Chimp encode input has duplicate header names.
    seqs = [i for i in seqs if 'chimp' not in i]
    seqs = random.sample(seqs, 2)
    cigarPath = getTempFile()
    try:
        system("cPecanLastz --format=cigar %s[multiple] %s[multiple] > %s" %
               (seqs[0], seqs[1], cigarPath))
        bed = popenCatch("cactus_coverage %s %s" % (seqs[1], cigarPath))
        prevChrom = None
        prevStart = None
        prevEnd = None
        # Check that everything is sorted and there are no overlaps
        for line in bed.split("\n"):
            line = line.strip()
            if line == "":
                continue
            fields = line.split()
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            self.assertTrue(end - start >= 1)
            if chrom == prevChrom:
                self.assertTrue(start > prevStart)
                self.assertTrue(start >= prevEnd)
            # Remember this interval; without this the ordering checks
            # above would never be reached.
            prevChrom, prevStart, prevEnd = chrom, start, end
    finally:
        os.remove(cigarPath)
def wrap(self):
    """Run fn(self) with file descriptors 1 and 2 redirected to a temporary
    log file; if fn raises, replay the captured log on the real stdout and
    re-raise.

    The original descriptors are always restored and the temporary file is
    always removed.
    """
    # Pretty much ripped from the toil worker.py setup.
    tempPath = getTempFile()
    oldStdout = os.dup(1)
    oldStderr = os.dup(2)

    # Open the file to send stdout/stderr to.
    logFh = os.open(tempPath, os.O_RDWR | os.O_CREAT | os.O_APPEND)

    # Replace standard output with a descriptor for the log file
    os.dup2(logFh, 1)

    # Replace standard error with a descriptor for the log file
    os.dup2(logFh, 2)
    try:
        fn(self)
    except:
        # Deliberately catches everything: the log is replayed and the
        # exception is re-raised unchanged.
        with os.fdopen(os.dup(logFh)) as logFile:
            logFile.seek(0)
            captured = logFile.read()
        # Write through a duplicate so that closing (and thereby flushing)
        # the file object does not close oldStdout, which is still needed
        # to restore descriptor 1 below.
        with os.fdopen(os.dup(oldStdout), 'w') as realStdout:
            realStdout.write(captured)
        raise
    finally:
        # Close the descriptor we used to open the file
        os.close(logFh)
        # Reset stdout and stderr
        os.dup2(oldStdout, 1)
        os.dup2(oldStderr, 2)
        # The saved duplicates are no longer needed; close them so each
        # call does not leak two descriptors.
        os.close(oldStdout)
        os.close(oldStderr)
        os.remove(tempPath)
def run(self):
    """Sample reference positions and fan them out to ScoreColumns children.

    Up to self.opts.numSamples distinct positions are sampled (repeated
    draws are dropped), split into slices of self.opts.samplesPerJob, and
    each slice is given to a ScoreColumns child with its own output file.
    A Summarize follow-on receives all the output files.
    """
    opts = self.opts
    speciesTree = popenCatch("halStats --tree %s" % (opts.halFile)).strip()
    chromSizes = getChromSizes(opts.halFile, opts.refGenome)

    positions = []
    # For ensuring that a column isn't counted multiple times from
    # different reference positions.
    positionSet = set()
    for _ in xrange(opts.numSamples):
        # Have to sample the columns here since otherwise it can
        # be difficult to independently seed several RNGs
        pos = samplePosition(chromSizes)
        if pos in positionSet:
            continue
        positions.append(pos)
        positionSet.add(pos)

    outputs = []
    for sliceStart in xrange(0, opts.numSamples, opts.samplesPerJob):
        sliceEnd = sliceStart + opts.samplesPerJob
        jobPositions = positions[sliceStart:sliceEnd]
        outputFile = getTempFile(rootDir=self.getGlobalTempDir())
        outputs.append(outputFile)
        self.addChildTarget(ScoreColumns(opts, jobPositions, outputFile,
                                         speciesTree, positionSet))
    self.setFollowOnTarget(Summarize(opts, outputs, opts.outputFile,
                                     opts.writeMismatchesToFile))
def scriptTree_SortTest(testNo, batchSystem, lines=10000, maxLineLength=10, N=10000):
    """Tests scriptTree/jobTree by sorting a file in parallel.

    For each of testNo rounds a file is generated with makeFileToSort, a
    reference sort is computed in memory, and scriptTreeTest_Sort.py is run
    on the file (re-running until jobTreeStatus reports completion). The
    file is then re-read and compared with the reference sort.
    """
    for test in xrange(testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "testJobTree")
        makeFileToSort(tempFile, lines=lines, maxLineLength=maxLineLength)
        # First make our own sorted version
        with open(tempFile, 'r') as fileHandle:
            l = fileHandle.readlines()
        l.sort()
        # Sort the file
        while True:
            command = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %i --batchSystem %s --jobTime 1.0 --maxCpus 20 --retryCount 2" % (jobTreeDir, tempFile, N, batchSystem)
            system(command)
            try:
                system("jobTreeStatus --jobTree %s --failIfNotComplete" % jobTreeDir)
                break
            except Exception:
                # Restart on an ordinary failure only; a bare except here
                # would also swallow KeyboardInterrupt/SystemExit and make
                # this loop impossible to interrupt.
                print("The jobtree failed and will be restarted")
                continue

        # Now check the file is properly sorted..
        # Now get the sorted file
        with open(tempFile, 'r') as fileHandle:
            l2 = fileHandle.readlines()
        checkEqual(l, l2)
        system("rm -rf %s" % tempDir)
def testMerge(self):
    """Check that merging two sorted files gives the sorted union.

    Two files are generated and sorted, merged into a third, and the third
    file's lines are compared with the sorted concatenation of the first
    two.
    """
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile1 = getTempFile(rootDir=tempDir)
        tempFile2 = getTempFile(rootDir=tempDir)
        tempFile3 = getTempFile(rootDir=tempDir)
        inputs = (tempFile1, tempFile2)
        for path in inputs:
            makeFileToSort(path)
        for path in inputs:
            sort(path)
        merge(tempFile1, tempFile2, tempFile3)
        expected = sorted(loadFile(tempFile1) + loadFile(tempFile2))
        merged = loadFile(tempFile3)
        checkEqual(expected, merged)
        system("rm -rf %s" % tempDir)
def parasolRestart():
    """Function starts the parasol hub and node.

    Parasol is stopped, then paraNode and paraHub are started repeatedly
    until "parasol status" reports a "Nodes dead" count of 0, waiting five
    seconds between attempts.
    """
    parasolStop()
    while True:
        machineList = os.path.join(workflowRootPath(), "jobTree", "machineList")
        os.system("paraNode start -hub=localhost")
        os.system("paraHub %s subnet=127.0.0 &" % (machineList,))
        tempFile = getTempFile()
        dead = True
        try:
            popen("parasol status", tempFile)
            with open(tempFile, 'r') as fileHandle:
                for line in fileHandle:
                    if "Nodes dead" not in line:
                        continue
                    print(line)
                    # The dead-node count is the last token on the line.
                    if int(line.split()[-1]) == 0:
                        dead = False
        except RuntimeError:
            # A failed status query leaves `dead` True, i.e. try again.
            pass
        os.remove(tempFile)
        if not dead:
            break
        logger.info("Tried to restart the parasol process, but failed, will try again")
        parasolStop()
        time.sleep(5)
    logger.info("Restarted the parasol process")
def killMasterAndParasol():
    """Method to destroy master process

    Scans the output of "ps -a"; for each paraNode/paraHub or
    jobTreeMaster.py line a coin is flipped, and the first process that
    loses the flip is sent "kill" and the scan stops. Parasol is restarted
    afterwards in every case.
    """
    tempFile = getTempFile()
    popen("ps -a", tempFile)
    fileHandle = open(tempFile, 'r')
    # Example parasol state lines:
    # 67401 ttys002    0:00.06 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    # 67403 ttys002    0:00.65 /Users/benedictpaten/kent/src/parasol/bin/paraHub -log=/tmp/hub.2009-07-08.log machineList subnet=127.0.0
    # 68573 ttys002    0:00.00 /Users/benedictpaten/kent/src/parasol/bin/paraNode start -hub=localhost -log=/tmp/node.2009-07-08.log -umask=002 -userPath=bin:bin/x86_64:bin/i
    for line in fileHandle:
        tokens = line.split()
        isParasolLine = 'paraNode' in line or 'paraHub' in line
        if isParasolLine:
            if random.random() > 0.5:
                pid = int(tokens[0])
                exitValue = os.system("kill %i" % pid)
                logger.info("Tried to kill parasol process: %i, line: %s, exit value: %i" % (pid, line, exitValue))
                break
        elif 'jobTreeMaster.py' in line:
            logger.info("Have job tree master line")
            if random.random() > 0.5:
                pid = int(tokens[0])
                exitValue = os.system("kill %i" % pid)
                logger.info("Tried to kill master process: %i, line: %s, exit value: %i" % (pid, line, exitValue))
                break
    fileHandle.close()
    os.remove(tempFile)
    parasolRestart()
def run(self):
    """Build the cactus alignment for the assembly unless it already exists
    in self.outputDir, then add the MakeStats1 child target.

    The workflow runs inside the local temp dir, whose whole contents are
    moved into self.outputDir once the job tree has completed.
    """
    cactusAlignmentName = "cactusAlignment"
    cactusAlignment = os.path.join(self.outputDir, cactusAlignmentName)
    if not os.path.exists(cactusAlignment):
        localTempDir = self.getLocalTempDir()

        # Prepare the assembly: first copy it (unzipping a .gz input).
        if self.assemblyFile.endswith('.gz'):
            gzippedCopy = getTempFile(rootDir=localTempDir, suffix=".gz")
            system("cp %s %s" % (self.assemblyFile, gzippedCopy))
            system("gunzip %s" % gzippedCopy)
            # The unzipped file is expected at the same path minus ".gz".
            tempAssemblyFile = gzippedCopy[:-3]
            assert os.path.exists(tempAssemblyFile)
        else:
            tempAssemblyFile = getTempFile(rootDir=localTempDir, suffix="")
            system("cp %s %s" % (self.assemblyFile, tempAssemblyFile))

        # Make the supporting temporary files
        tempExperimentFile = getTempFile(rootDir=localTempDir)
        tempJobTreeDir = os.path.join(localTempDir, "jobTree")

        # Make the experiment file
        experiment = ExperimentWrapper.createExperimentWrapper(
            sequences=self.haplotypeSequences + [tempAssemblyFile],
            newickTreeString=self.newickTree,
            outputDir=localTempDir,
            configFile=self.configFile)
        experiment.setDbName(cactusAlignmentName)
        # This needs to be set to ensure the thing gets put in the right
        # directory
        experiment.setDbDir(os.path.join(localTempDir, experiment.getDbName()))
        experiment.writeXML(tempExperimentFile)

        # Now run cactus workflow
        runCactusWorkflow(experimentFile=tempExperimentFile,
                          jobTreeDir=tempJobTreeDir,
                          buildAvgs=False,
                          buildReference=True,
                          batchSystem="single_machine",
                          maxThreads=1,
                          jobTreeStats=True)
        logger.info("Ran the workflow")

        # Check if the jobtree completed successfully.
        runJobTreeStatusAndFailIfNotComplete(tempJobTreeDir)
        logger.info("Checked the job tree dir")

        # Compute the stats
        tempJobTreeStatsFile = os.path.join(localTempDir, "jobTreeStats.xml")
        system("jobTreeStats --jobTree %s --outputFile %s" %
               (tempJobTreeDir, tempJobTreeStatsFile))

        # Now move everything from the local temp dir to the output
        system("mv %s/* %s" % (localTempDir, self.outputDir))
        assert os.path.exists(cactusAlignment)
    # We're done!
    self.addChildTarget(MakeStats1(self.outputDir, cactusAlignment,
                                   self.options))
def main():
    """Entry point of the "first" test command for a job tree node.

    Parses the job and tree-pointer XML files, adds one child command per
    child of the tree (some with a random "time" attribute), rewrites the
    job file so its own command becomes jobTreeTest_CommandSecond.py, and
    then exits with status 1 about one time in ten, otherwise 0.
    """
    parser = getBasicOptionParser("usage: %prog [options]", "%prog 0.1")
    parser.add_option("--job", dest="jobFile",
                      help="Job file containing command to run",
                      default="None")
    parser.add_option("--treePointer", dest="treePointerFile",
                      help="File containing pointer to the tree data",
                      default="None")
    options, args = parseBasicOptions(parser)
    logger.info("Parsed the input arguments")

    job = ET.parse(options.jobFile).getroot()
    setLogLevel(job.attrib["log_level"])
    logger.info("Parsed the job XML")

    treePointer = ET.parse(options.treePointerFile).getroot()
    logger.info("Parsed the tree pointer XML")

    tree = ET.parse(treePointer.attrib["file"]).getroot()
    logger.info("Parsed the tree XML")

    jobChildren = job.find("children")
    for child in tree.find("children").findall("child"):
        # Make the child tree pointer
        childTreePointerFile = makeTreePointer(
            child.attrib["file"],
            getTempFile(rootDir=job.attrib["global_temp_dir"]))
        # Make the child command
        unbornChild = ET.SubElement(jobChildren, "child")
        unbornChild.attrib["command"] = \
            "jobTreeTest_CommandFirst.py --treePointer %s --job JOB_FILE" % \
            (childTreePointerFile,)
        if random.random() > 0.2:
            unbornChild.attrib["time"] = str(random.random() * 10)
        # Record the child tree pointer on the in-memory tree pointer
        ET.SubElement(treePointer.find("children"), "child",
                      {"file": childTreePointerFile})

    job.attrib["command"] = \
        "jobTreeTest_CommandSecond.py --treePointer %s --job JOB_FILE" % \
        (options.treePointerFile,)
    logger.info("Made new command")

    with open(options.jobFile, 'w') as fileHandle:
        ET.ElementTree(job).write(fileHandle)
    logger.info("Updated the job file")

    # These lines should end up in the logs
    sys.stderr.write("Checking that we can report to std err\n")
    print("Checking that we can report to std out")

    if random.random() > 0.9:
        logger.info("Going to fail the job")
        sys.exit(1)
    logger.info("Going to pass the job done okay")
    sys.exit(0)
def getCactusInputs_randomWithConstraints(regionNumber=0, tempDir=None):
    """Return random cactus inputs plus a file of random constraints.

    The constraints file is created in tempDir and holds one cigar (written
    without probabilities) per alignment produced by makeRandomConstraints.
    Returns (sequenceDirs, newickTreeString, constraints).
    """
    sequenceDirs, newickTreeString = getCactusInputs_random(
        regionNumber=regionNumber, tempDir=tempDir)
    constraints = getTempFile(rootDir=tempDir)
    with open(constraints, 'w') as fileHandle:
        fastas = getFastasFromSequence(sequenceDirs)
        for pairwiseAlignment in makeRandomConstraints(fastas):
            cigarWrite(fileHandle, pairwiseAlignment, withProbs=False)
    return sequenceDirs, newickTreeString, constraints
def getConfigFile(matchingAlgorithm="greedy"):
    """Copy cactus_progressive_config.xml to a temporary .xml file in "./"
    with the <reference> matching_algorithm attribute set to
    matchingAlgorithm, and return the absolute path of the copy.
    """
    tempConfigFile = getTempFile(rootDir="./", suffix=".xml")
    templatePath = os.path.join(cactusRootPath(),
                                "cactus_progressive_config.xml")
    config = ET.parse(templatePath).getroot()
    referenceNode = config.find("reference")
    referenceNode.attrib["matching_algorithm"] = matchingAlgorithm
    ET.ElementTree(config).write(tempConfigFile)
    return os.path.abspath(tempConfigFile)
def sortCigarByContigAndPos(cigarPath, contigNum):
    """Sort a cigar file with sort(1) by contig name, then numerically by
    start position, and return the path of the sorted temporary file.

    contigNum == 1 sorts on columns 2 and 3; any other value sorts on
    columns 6 and 7.
    """
    if contigNum == 1:
        contigNameKey, startPosKey = 2, 3
    else:
        contigNameKey, startPosKey = 6, 7
    tempFile = getTempFile()
    sortCommand = "sort -k %d,%d -k %d,%dn %s > %s" % (
        contigNameKey, contigNameKey, startPosKey, startPosKey,
        cigarPath, tempFile)
    system(sortCommand)
    return tempFile
def progressiveFunction(self, experimentFile, toilDir, batchSystem,
                        buildAvgs, buildReference, buildHal, buildFasta,
                        toilStats, subtreeRoot=None):
    """Run cactus progressive on the experiment and verify that every
    header in the .c2h outputs comes from the matching input FASTA.

    A multi-cactus project is created in a temporary directory under the
    current working directory; that directory is removed at the end of a
    successful run.
    """
    tempDir = getTempDirectory(os.getcwd())
    tempExperimentDir = os.path.join(tempDir, "exp")
    runCreateMultiCactusProject(experimentFile,
                                tempExperimentDir,
                                fixNames=False,
                                root=subtreeRoot)
    logger.info("Put the temporary files in %s" % tempExperimentDir)

    runCactusProgressive(os.path.join(tempExperimentDir, "exp_project.xml"),
                         toilDir,
                         batchSystem=batchSystem,
                         buildAvgs=buildAvgs,
                         toilStats=toilStats)

    # Check that the headers and sequences in the output are the
    # same as the sequences in the input (minus differences in
    # repeat-masking)
    exp = ExperimentWrapper(ET.parse(experimentFile).getroot())
    seqMap = exp.buildSequenceMap()
    # Maps genome name -> headers in fasta
    headers = {}
    for genomeName, inputSequencePath in seqMap.items():
        if os.path.isdir(inputSequencePath):
            # Some "input sequence paths" are actually provided as
            # directories containing multiple FASTAs. The concatenation
            # is created inside tempDir so the final "rm -rf" removes it
            # instead of leaving it behind.
            concatenatedPath = getTempFile(rootDir=tempDir)
            system("cat %s/* > %s" % (inputSequencePath, concatenatedPath))
            inputSequencePath = concatenatedPath
        headers[genomeName] = [record[0]
                               for record in fastaRead(inputSequencePath)]

    # check headers inside .c2h output
    for expPath in glob.glob('%s/*/*_experiment.xml' % (tempExperimentDir)):
        subExp = ExperimentWrapper(ET.parse(expPath).getroot())
        outgroups = subExp.getOutgroupEvents()
        c2hPath = subExp.getHALPath()
        with open(c2hPath) as f:
            for line in f:
                fields = line.split('\t')
                if fields[0] != 's':
                    continue
                # Sequence line: drop the first and last character of the
                # genome and header fields.
                genome = fields[1][1:-1]
                header = fields[2][1:-1]
                if genome in headers and genome not in outgroups:
                    # This genome is an input genome
                    self.assertTrue(header in headers[genome],
                                    'Header %s from output c2h %s not found in input fa %s'
                                    ' for genome %s' % (header, c2hPath, seqMap[genome], genome))

    runToilStatusAndFailIfNotComplete(toilDir)
    system("rm -rf %s" % tempDir)
def setUp(self):
    """Write the FASTA and cigar fixtures used by the coverage tests.

    Creates four small FASTA files (A-D) and one cigar file, storing their
    paths on self. Every file is written through a context manager so the
    data is flushed and the handle closed before a test uses it.
    """
    unittest.TestCase.setUp(self)
    # simple test data -- not an actual alignment, but to test if
    # coverage is correct. no overlap on B, but overlap on A.
    self.simpleFastaPathA = getTempFile()
    with open(self.simpleFastaPathA, 'w') as fixture:
        fixture.write(dedent('''\
        >id=0|simpleSeqA1 otherTokens thatDon'tMatter
        ACTAGAGTAGGAGAGAGAGGGGGG
        CATGCATGCATGCATGCATGCATG
        >id=1|simpleSeqA2 otherTokens thatDon'tMatter
        AAAAAAAAAAAAAAAACTCGTGAG
        CATGCATGCATGCATGCATGCATG'''))
    self.simpleFastaPathB = getTempFile()
    with open(self.simpleFastaPathB, 'w') as fixture:
        fixture.write(dedent('''\
        >id=2|simpleSeqB1 otherTokens
        CATGCATGCATGCATGCATGCATG
        CATGCATGCATGCATGCATGCATG'''))
    self.simpleFastaPathC = getTempFile()
    with open(self.simpleFastaPathC, 'w') as fixture:
        fixture.write(dedent('''\
        >id=3|simpleSeqC1 otherTokens thatDon'tMatter
        CATGCATGCATGCATGCATGCATG
        CATGCATGCATGCATGCATGCATG'''))
    self.simpleFastaPathD = getTempFile()
    with open(self.simpleFastaPathD, 'w') as fixture:
        fixture.write(dedent('''\
        >id=4|simpleSeqD otherTokens thatDon'tMatter
        CATGCATGCATGCATGCATGCATG
        CATGCATGCATGCATGCATGCATG'''))
    self.simpleCigarPath = getTempFile()
    with open(self.simpleCigarPath, 'w') as fixture:
        fixture.write(dedent('''\
        cigar: id=2|simpleSeqB1 0 9 + id=0|simpleSeqA1 10 0 - 0 M 8 D 1 M 1
        cigar: id=2|simpleSeqB1 9 18 + id=0|simpleSeqA1 2 6 + 0 M 3 I 5 M 1
        cigar: id=2|simpleSeqB1 18 28 + id=1|simpleSeqA2 0 10 + 0 M 1 I 2 M 2 D 2 M 5
        cigar: id=2|simpleSeqB1 28 30 + id=1|simpleSeqA2 6 8 + 0 M 2
        cigar: id=2|simpleSeqB1 30 32 + id=1|simpleSeqA2 7 9 + 0 M 2
        cigar: id=12|simpleSeqZ1 0 1 + id=0|simpleSeqA1 6 7 + 0 M 1
        cigar: id=3|simpleSeqC1 0 5 + id=4|simpleSeqD 0 5 + 0 M 5
        cigar: id=4|simpleSeqD 5 10 + id=3|simpleSeqC1 5 10 + 0 M 5
        cigar: id=3|simpleSeqC1 10 15 + id=3|simpleSeqC1 15 20 + 0 M 5
        cigar: id=303|simpleSeqNonExistent 0 10 + id=3|simpleSeqC1 0 10 + 0 M 10
        '''))
def testCactusRealign(self):
    """Runs cactus realign using the default parameters and checks that the
    realigned output cigars align the same subsequences.

    For every sequence pair, lastz is run, its output is realigned, and
    the two outputs are compared line by line (blank lines skipped) with
    sameCoordinates.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        lastzOutput = getTempFile(rootDir=self.tempDir)
        runLastz(seqFile1, seqFile2, alignmentsFile=lastzOutput,
                 lastzArguments=self.defaultLastzArguments)
        realignOutput = getTempFile(rootDir=self.tempDir)
        runCactusRealign(seqFile1, seqFile2,
                         inputAlignmentsFile=lastzOutput,
                         outputAlignmentsFile=realignOutput,
                         realignArguments=self.defaultRealignArguments)

        # Lines read from a file keep their newline, so compare the
        # stripped text to actually skip blank lines.
        with open(lastzOutput, 'r') as lastzHandle:
            lastzLines = [i for i in lastzHandle if i.strip() != '']
        with open(realignOutput, 'r') as realignHandle:
            realignLines = [i for i in realignHandle if i.strip() != '']

        # Each name is bound to a line of the file it is named after.
        for lastzLine, realignLine in zip(lastzLines, realignLines):
            realignCigar = cigarReadFromString(realignLine)
            lastzCigar = cigarReadFromString(lastzLine)
            self.assertTrue(realignCigar.sameCoordinates(lastzCigar))
def run(self):
    """Sort the [fileStart, fileEnd) range of the input file.

    A range longer than self.N is split at the midpoint into two Down
    children with an Up follow-on that receives their two temporary files
    and the output file; a shorter range is copied to the output file and
    sorted directly.
    """
    length = self.fileEnd - self.fileStart
    self.logToMaster("Am running a down target with length: %i from input file: %s" % (length, self.inputFile))
    assert length >= 0
    if length <= self.N:
        # We can sort this bit of the file
        copySubRangeOfFile(self.inputFile, self.fileStart, self.fileEnd,
                           self.outputFile)
        sort(self.outputFile)
        return

    # We will subdivide the file
    midPoint = getMidPoint(self.inputFile, self.fileStart, self.fileEnd)
    assert midPoint >= self.fileStart
    assert midPoint + 1 < self.fileEnd
    # Add one to avoid the newline
    splitPoint = midPoint + 1
    tempFile1 = getTempFile(rootDir=self.getGlobalTempDir())
    tempFile2 = getTempFile(rootDir=self.getGlobalTempDir())
    self.addChildTarget(Down(self.inputFile, self.fileStart, splitPoint,
                             self.N, tempFile1))
    self.addChildTarget(Down(self.inputFile, splitPoint, self.fileEnd,
                             self.N, tempFile2))
    self.setFollowOnTarget(Up(tempFile1, tempFile2, self.outputFile))
def run(self):
    """Split each genome's BED file into chunks of lines and add one
    RunAncestorsML child per chunk.

    The chunk size is the line count divided by self.jobsPerGenome, rounded
    up (and at least 1). A WriteNucleotides follow-on receives the
    per-genome lists of output files.
    """
    outputsPerGenome = {}
    for genome, bedFile in self.bedFileDict.items():
        genomeOutputs = []
        outputsPerGenome[genome] = genomeOutputs
        numLines = int(popenCatch("wc -l %s | cut -d' ' -f 1" % bedFile))
        linesPerJob = int(math.ceil(float(numLines) / self.jobsPerGenome)) or 1
        for start in xrange(0, numLines, linesPerJob):
            end = min(start + linesPerJob, numLines)
            bedForJob = getTempFile(rootDir=self.getGlobalTempDir())
            # head keeps everything up to the end of this chunk; tail then
            # keeps only the chunk's own lines.
            system("head -n %d %s | tail -n %d > %s" % (start + linesPerJob,
                                                         bedFile,
                                                         end - start,
                                                         bedForJob))
            output = getTempFile(rootDir=self.getGlobalTempDir())
            self.addChildTarget(RunAncestorsML(self.halFile, genome,
                                               bedForJob, self.phyloPModel,
                                               output))
            genomeOutputs.append(output)
    self.setFollowOnTarget(WriteNucleotides(outputsPerGenome, self.halFile))
def writeSequenceData(target, genome, hal, hubDir):
    """Write the .2bit and chrom.sizes for a genome.

    The files go into hubDir/<genome>/, which is created if missing. The
    sequence is extracted with hal2fasta into a temporary FASTA that is
    removed afterwards.
    """
    genomeDir = os.path.join(hubDir, genome)
    if not os.path.isdir(genomeDir):
        os.makedirs(genomeDir)
    fasta = getTempFile()
    system("hal2fasta %s %s > %s" % (hal, genome, fasta))
    twoBitPath = os.path.join(genomeDir, genome + '.2bit')
    system("faToTwoBit %s %s" % (fasta, twoBitPath))
    chromSizesPath = os.path.join(genomeDir, 'chrom.sizes')
    system("twoBitInfo %s %s" % (twoBitPath, chromSizesPath))
    os.remove(fasta)
def runReferenceMedianProblemTest(medianHistory, greedyIterations, theta):
    """Runs the reference problem for a given median history

    Adjacency weights are summed over all leaf genomes, written as one
    tab-separated record to a temporary file, and fed on stdin to the
    referenceMedianProblemTest2 binary. Its output is read back as a single
    chromosome. Asserts that the result has the same elements as the
    median genome of medianHistory, and returns the resulting Genome.
    """
    # Make adjacencies
    stubNumber = 2
    nodeNumber = len(
        medianHistory.getMedianGenome().getElements()) * 2 + stubNumber
    weights = {}
    for genome in medianHistory.getLeafGenomes():
        for node1, node2, distance in genome.getTransitiveAdjacencies():
            if (node1, node2) in weights:
                weights[(node1, node2)] += weightFn(distance, theta)
            else:
                weights[(node1, node2)] = weightFn(distance, theta)

    def translateLeftSideOfElementToNode(element):
        assert element != 0
        if element < 0:
            return abs(element) * 2
        return element * 2 + 1

    def translateLeftNodeToElement(node):
        assert node >= stubNumber
        assert node < nodeNumber
        # Floor division keeps the element an int even under true division.
        element = node // 2
        if (node % 2) == 0:
            element *= -1
        return element

    # Now build the problem description
    weightFields = "\t".join([
        "%i\t%i\t%f" % (translateLeftSideOfElementToNode(-node1),
                        translateLeftSideOfElementToNode(node2),
                        weights[(node1, node2)])
        for (node1, node2) in weights
    ])
    problemInput = "%i\t%i\t%i\t%i\t%s" % (greedyIterations, nodeNumber,
                                           stubNumber, len(weights),
                                           weightFields)
    tempPath = getTempFile()
    try:
        with open(tempPath, 'w') as tempFile:
            tempFile.write(problemInput)
        # Command
        command = os.path.join(
            os.path.split(
                os.path.abspath(
                    matchingAndOrdering.tests.simulatedGenome.__file__))[0],
            "testBin", "referenceMedianProblemTest2")
        output = popenCatch(command + " < %s" % tempPath)
    finally:
        # Remove the input file even when the binary fails.
        os.remove(tempPath)
    medianChromosome = Chromosome()
    for adjacency in output.split():
        medianChromosome.append(translateLeftNodeToElement(int(adjacency)))
    medianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    medianGenome.addChromosome(medianChromosome)
    assert medianGenome.getElements() == medianHistory.getMedianGenome(
    ).getElements()
    return medianGenome
def testJobTreeStats_SortSimple(self):
    """Tests the jobTreeStats utility using the scriptTree_sort example.

    Each round sorts a generated file with scriptTreeTest_Sort.py (run with
    --stats) and then runs jobTreeStats on the resulting job tree.
    """
    lines = 100000
    maxLineLength = 10
    N = 1000
    for test in xrange(self.testNo):
        tempDir = getTempDirectory(os.getcwd())
        tempFile = getTempFile(rootDir=tempDir)
        outputFile = getTempFile(rootDir=tempDir)
        jobTreeDir = os.path.join(tempDir, "jobTree")
        makeFileToSort(tempFile, lines, maxLineLength)
        # Sort the file
        sortCommand = "scriptTreeTest_Sort.py --jobTree %s --logLevel=DEBUG --fileToSort=%s --N %s --stats --jobTime 0.5" % (jobTreeDir, tempFile, N)
        system(sortCommand)
        # Now get the stats
        statsCommand = "jobTreeStats --jobTree %s --outputFile %s" % (jobTreeDir, outputFile)
        system(statsCommand)
        # Cleanup
        system("rm -rf %s" % tempDir)
def makeRunnable(self, tempDir):
    """Pickle this object into a temporary .pickle file and return the
    "scriptTree <pickleFile> <import strings>" command line for it.

    The import strings of self.target are de-duplicated and joined with
    single spaces.
    """
    pickleFile = getTempFile(".pickle", tempDir)
    # HIGHEST_PROTOCOL is a binary format, so the file must be opened in
    # binary mode to be written unmodified.
    with open(pickleFile, 'wb') as fileHandle:
        cPickle.dump(self, fileHandle, cPickle.HIGHEST_PROTOCOL)
    classNames = " ".join(set(self.target.importStrings))
    return "scriptTree %s %s" % (pickleFile, classNames)
def testRandom(self):
    """Makes random sequences and tests that Ortheus can align them and
    produce a valid output.

    One output file and MAX_SEQS sequence files are registered in
    self.tempFiles up front. Each round builds a random tree and its
    sequences; rounds with more than 2 and at most MAX_SEQS sequences are
    aligned with ortheus_core and the alignment is checked.
    """
    outputFile = getTempFile()
    self.tempFiles.append(outputFile)

    MAX_SEQS = 20

    for _ in xrange(MAX_SEQS):
        self.tempFiles.append(getTempFile())

    for test in xrange(0, self.testNo):
        print("test no : %i " % test)
        # seqNo
        binaryTree = randomTree()
        middleSeq = getRandomSequence(250)[1]
        seqs = []
        getTreeSeqs(binaryTree, middleSeq, seqs)

        if 2 < len(seqs) <= MAX_SEQS:
            seqFiles = []
            for index, seq in enumerate(seqs):
                # Same offset as before: the sequence files are taken from
                # position 1 onwards of self.tempFiles.
                seqFile = self.tempFiles[1 + index]
                seqFiles.append(seqFile)
                with open(seqFile, 'w') as fileHandle:
                    fastaWrite(fileHandle, "%i" % index, seq)
            # Single pre-formatted arguments keep the printed text the
            # same as the former multi-argument print statements.
            print("Have seq files  %s" % (seqFiles,))

            treeString = printBinaryTree(binaryTree, True)
            print("For tree  %s" % (treeString,))

            # align seqs and check no failure
            command = "ortheus_core -a %s -b '%s' -d %s -e" % (
                " ".join(seqFiles), treeString, outputFile)
            print("command to call %s" % (command,))
            system(command)

            # check alignment is complete
            alignment = [column[:] for column in fastaAlignmentRead(outputFile)]
            checkAlignment(alignment, seqs)

        print("test no is finished : %i " % test)
def progressiveFunction(self, experimentFile, toilDir, batchSystem,
                        buildAvgs, buildHal, buildFasta, toilStats,
                        subtreeRoot=None):
    """Parse the experiment file, create a temporary seqFile, open it for
    writing, and fetch the experiment tree.

    NOTE(review): nothing is written to the file here; this looks like a
    truncated copy of a longer function -- confirm against the full source.
    """
    experimentRoot = ET.parse(experimentFile).getroot()
    eW = ExperimentWrapper(experimentRoot)
    seqFile = getTempFile()
    with open(seqFile, 'w') as f:
        tree = eW.getTree()
def runScript(self, binaryName, outputFile, specialOptions):
    """Run bin/<binaryName> against self.alignment unless outputFile
    already exists.

    The binary writes to a temporary path (the placeholder file is removed
    before the run), and the result is then moved to outputFile.
    """
    if os.path.exists(outputFile):
        return
    tempOutputFile = getTempFile(rootDir=self.getLocalTempDir())
    # Only the unique path is wanted; the binary creates the file itself.
    os.remove(tempOutputFile)
    binaryPath = os.path.join(getRootPathString(), "bin", binaryName)
    command = "%s --cactusDisk '%s' --outputFile %s --minimumNsForScaffoldGap %s --sampleNumber %s %s" % (
        binaryPath,
        getCactusDiskString(self.alignment),
        tempOutputFile,
        self.options.minimumNsForScaffoldGap,
        self.options.sampleNumber,
        specialOptions)
    system(command)
    system("mv %s %s" % (tempOutputFile, outputFile))
def testCactusCallPipes(self):
    """Check that cactus_call chains a list of commands as a pipeline.

    "foobar" is written to a temporary file and piped through cat, sed and
    awk; the captured output must be "quuxbazbar" plus a newline.
    """
    inputFile = getTempFile(rootDir=self.tempDir)
    with open(inputFile, 'w') as f:
        f.write('foobar\n')
    # using 'cat' here rather than infile is intentional; it tests
    # whether the directory is mounted into containers correctly.
    pipeline = [
        ['cat', inputFile],
        ['sed', 's/foo/baz/g'],
        ['awk', '{ print "quux" $0 }'],
    ]
    output = cactus_call(parameters=pipeline, check_output=True)
    self.assertEqual(output, 'quuxbazbar\n')