def runAsMedianMedianProblemTest(medianHistory):
    """Runs AsMedian on the leaf genomes of a median history.

    Requires AsMedian to be installed (the command run is
    "java BIOMedian"). I got it from:
    https://sites.google.com/site/andrewweixu/Home/software/asmedian

    The leaf genome string is written to "simulatedGenomeTempFile.txt" in
    the current working directory, the solver is run on that file and the
    result is read back from the same path with ".rst" appended. The input
    file is removed even if the solver fails, and the result file is
    removed once it has been read.

    medianHistory -- object providing getLeafGenomeString().

    Returns a Genome with one Chromosome per parsed result line.
    """
    # Dump to disk
    tempFile = os.path.join(os.getcwd(), "simulatedGenomeTempFile.txt")
    resultFile = tempFile + ".rst"
    with open(tempFile, 'w') as fileHandle:
        fileHandle.write(medianHistory.getLeafGenomeString())
    try:
        # -cp /Users/benedictpaten/Desktop/ASMedian-1.0
        popenCatch("java BIOMedian %s" % tempFile)
    finally:
        # Don't leave the input behind when the solver blows up.
        os.remove(tempFile)
    # Parse in
    try:
        with open(resultFile, 'r') as fileHandle:
            resultLines = fileHandle.readlines()
    finally:
        if os.path.exists(resultFile):
            os.remove(resultFile)
    asMedianMedianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    # The first line is skipped; chromosome lines run until the first line
    # starting with '#'.
    for line in resultLines[1:]:
        if line.startswith('#'):
            break
        asMedianChromosome = Chromosome()
        # The first token of each line is not an element, the rest are.
        for element in line.split()[1:]:
            asMedianChromosome.append(int(element))
        asMedianMedianGenome.addChromosome(asMedianChromosome)
    return asMedianMedianGenome
def testCPecanRealignSplitSequences(self):
    """Runs cPecanRealign, splitting indels longer than 100bp, and checks
    that the coverage from the results is the same as the coverage from
    realigning with no arguments.

    The temporary output files are removed even when a command or an
    assertion fails.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        # Drop the lastz command since it's not needed. But this
        # is still convenient to use the same parameters as all
        # the other tests
        realignCommand, _ = getCommands(seqFile1, seqFile2)
        splitRealignCommand = realignCommand + " --splitIndelsLongerThanThis 100"
        realignOutput = getTempFile()
        splitRealignOutput = getTempFile()
        try:
            system("%s > %s" % (realignCommand, realignOutput))
            system("%s > %s" % (splitRealignCommand, splitRealignOutput))
            # The following will fail until we refactor.
            # Check coverage on seqFile1, then on seqFile2.
            for seqFile in (seqFile1, seqFile2):
                splitRealignCoverage = popenCatch(
                    "cactus_coverage %s %s" % (seqFile, splitRealignOutput))
                realignCoverage = popenCatch(
                    "cactus_coverage %s %s" % (seqFile, realignOutput))
                # assertEqual reports both values on failure.
                self.assertEqual(splitRealignCoverage, realignCoverage)
        finally:
            os.remove(realignOutput)
            os.remove(splitRealignOutput)
def testProgressiveOutgroupsVsAllOutgroups(self):
    """Compares outgroup coverage on an ingroup between the "set against
    set" blast mode and the "ingroups vs. outgroups" blast mode.
    """
    encodeRegion = "ENm001"
    ingroup = "human"
    outgroups = ["macaque", "rabbit", "dog"]
    lastOutgroup = outgroups[-1]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
    outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                     for name in outgroups]

    def ingroupCoverage(cigarFile):
        # Total bases of the ingroup covered by the alignments in
        # cigarFile: write the coverage to a temp file, then sum
        # (column 3 - column 2) over its lines.
        coverageFile = getTempFile(rootDir=self.tempDir)
        calculateCoverage(sequenceFile=ingroupPath, cigarFile=cigarFile,
                          outputFile=coverageFile)
        return int(popenCatch(
            "cat %s | awk '{ total += $3 - $2} END { print total }'" % coverageFile))

    def lastOutgroupCoverage(cigarFile):
        # Same as ingroupCoverage, restricted to the cigar lines that
        # mention the last outgroup.
        filteredCigars = getTempFile(rootDir=self.tempDir)
        system("grep %s %s > %s" % (lastOutgroup, cigarFile, filteredCigars))
        return ingroupCoverage(filteredCigars)

    # Run in "set against set" mode, aligning the entire ingroup
    # vs each outgroup
    runCactusBlast([ingroupPath],
                   alignmentsFile=self.tempOutputFile,
                   toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                   chunkSize=500000, overlapSize=10000,
                   targetSequenceFiles=outgroupPaths)
    # Run in "ingroup vs outgroups" mode, aligning the ingroup vs
    # the outgroups in order, trimming away sequence that's
    # already been aligned.
    runCactusBlastIngroupsAndOutgroups([ingroupPath], outgroupPaths,
                                       alignmentsFile=self.tempOutputFile2,
                                       toilDir=os.path.join(self.tempDir, "outgroupToil"))

    # Get the coverage on the ingroup, in bases, from each run.
    coverageSetVsSet = ingroupCoverage(self.tempOutputFile)
    coverageIngroupVsOutgroups = ingroupCoverage(self.tempOutputFile2)
    print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
    print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))
    # Make sure we're getting a reasonable fraction of the
    # alignments when using the trimming strategy.
    self.assertTrue(float(coverageIngroupVsOutgroups)/coverageSetVsSet >= 0.95)

    # Get the coverage on the ingroup, in bases, from just the
    # last outgroup. Obviously this should be much higher in set
    # vs set mode than in ingroup vs outgroup mode.
    coverageFromLastOutgroupSetVsSet = lastOutgroupCoverage(self.tempOutputFile)
    coverageFromLastOutgroupInVsOut = lastOutgroupCoverage(self.tempOutputFile2)
    print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (lastOutgroup, coverageFromLastOutgroupSetVsSet))
    print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (lastOutgroup, coverageFromLastOutgroupInVsOut))
    self.assertTrue(float(coverageFromLastOutgroupInVsOut)/coverageFromLastOutgroupSetVsSet <= 0.10)
def testCPecanRealign(self):
    """Runs cPecanRealign with default parameters and checks that each
    realigned cigar covers the same coordinates as the matching lastz
    cigar.
    """
    def nonEmptyLines(command):
        # Output of the command, one entry per non-empty line.
        return [line for line in popenCatch(command).split("\n") if line != '']

    for seqFile1, seqFile2 in seqFilePairGenerator():
        realignCommand, lastzCommand = getCommands(seqFile1, seqFile2)
        realignLines = nonEmptyLines(realignCommand)
        lastzLines = nonEmptyLines(lastzCommand)
        # zip() pairs lines positionally and stops at the shorter list.
        for realignLine, lastzLine in zip(realignLines, lastzLines):
            realignCigar = cigarReadFromString(realignLine)
            lastzCigar = cigarReadFromString(lastzLine)
            self.assertTrue(realignCigar.sameCoordinates(lastzCigar))
def testCPecanRealignDummy(self):
    """Runs cPecanRealign using the "rescoreOriginalAlignment" mode and
    checks the output is equivalent to what you'd get by just running
    lastz.

    Lines of the two outputs are compared pairwise; zip() stops at the
    shorter of the two outputs.
    """
    for seqFile1, seqFile2 in seqFilePairGenerator():
        realignCommand, lastzCommand = getCommands(seqFile1, seqFile2,
                                                   "--rescoreOriginalAlignment")
        realignLines = [i for i in popenCatch(realignCommand).split("\n") if i != '']
        lastzLines = [i for i in popenCatch(lastzCommand).split("\n") if i != '']
        for realignLine, lastzLine in zip(realignLines, lastzLines):
            realignCigar = cigarReadFromString(realignLine)
            lastzCigar = cigarReadFromString(lastzLine)
            # Identity check against None, and assertEqual so that a
            # failure shows the two cigars.
            self.assertIsNotNone(realignCigar)
            self.assertEqual(realignCigar, lastzCigar)
def getBedLineForSequence(halFile, genome, sequence, start, length):
    """Get a bed line from the beginning to the end of a given sequence.

    If start and length are None, the full sequence is returned,
    otherwise only the given region is.

    halFile  -- path passed to halStats.
    genome   -- genome name passed to "halStats --bedSequences".
    sequence -- name matched against the first field of each bed line.
    start, length -- both None, or both set.

    Returns the tab-joined fields of the matching bed line.

    Raises RuntimeError if the sequence is missing or duplicated, if only
    one of start/length is given, or if start + length exceeds the third
    field of the bed line.
    """
    bedLines = popenCatch("halStats --bedSequences %s %s" % (genome, halFile)).split("\n")
    # Built as a real list (it is len()'d and indexed below), and lines
    # that split to nothing -- empty or whitespace-only -- are skipped.
    seqLines = []
    for line in bedLines:
        fields = line.split()
        if fields and fields[0] == sequence:
            seqLines.append(fields)
    if len(seqLines) > 1:
        raise RuntimeError("More than one sequence named %s in genome %s, "
                           "aborting!" % (sequence, genome))
    if not seqLines:
        raise RuntimeError("No sequence named %s found in genome %s" % (sequence, genome))
    seqLine = seqLines[0]
    if start is None and length is None:
        return "\t".join(seqLine)
    if start is None or length is None:
        raise RuntimeError("Both start and length must be provided.")
    if start + length > int(seqLine[2]):
        raise RuntimeError("Selected region runs off end of sequence.")
    seqLine[1] = str(start)
    seqLine[2] = str(start + length)
    return "\t".join(seqLine)
def runCactusReference(cactusDiskDatabaseString, flowerNames, logLevel=None,
                       matchingAlgorithm=None,
                       referenceEventString=None,
                       permutations=None,
                       useSimulatedAnnealing=None,
                       theta=None,
                       phi=None,
                       maxWalkForCalculatingZ=None,
                       ignoreUnalignedGaps=None,
                       wiggle=None,
                       numberOfNs=None,
                       minNumberOfSequencesToSupportAdjacency=None,
                       makeScaffolds=None):
    """Runs cactus_reference, feeding flowerNames on stdin.

    Each optional argument is turned into a command-line fragment by
    nameValue(). Returns the non-empty lines of the command's output.
    """
    logLevel = getLogLevelString2(logLevel)
    # Fragments in the order they appear on the command line.
    optionStrings = [
        nameValue("matchingAlgorithm", matchingAlgorithm),
        nameValue("referenceEventString", referenceEventString),
        nameValue("permutations", permutations, int),
        nameValue("useSimulatedAnnealing", useSimulatedAnnealing, bool),
        nameValue("theta", theta, float),
        nameValue("phi", phi, float),
        nameValue("maxWalkForCalculatingZ", maxWalkForCalculatingZ, int),
        nameValue("ignoreUnalignedGaps", ignoreUnalignedGaps, bool),
        nameValue("wiggle", wiggle, float),
        nameValue("numberOfNs", numberOfNs, int),
        nameValue("minNumberOfSequencesToSupportAdjacency",
                  minNumberOfSequencesToSupportAdjacency, int),
        nameValue("makeScaffolds", makeScaffolds, bool),
    ]
    command = ("cactus_reference --cactusDisk '%s' --logLevel %s "
               % (cactusDiskDatabaseString, logLevel)) + " ".join(optionStrings)
    masterMessages = popenCatch(command, stdinString=flowerNames)
    logger.info("Ran cactus_reference okay")
    messages = []
    for message in masterMessages.split("\n"):
        if message != '':
            messages.append(message)
    return messages
def get_chromosomes(hal, ref_genome):
    """
    Returns a set of chromosomes present in the reference genome.

    Runs "halStats <hal> --chromSizes <ref_genome>" and collects the first
    whitespace-separated field of every non-blank output line. Blank lines
    are skipped wherever they occur, so the result does not depend on the
    output ending with a newline.
    """
    sizes = popenCatch('halStats {} --chromSizes {}'.format(hal, ref_genome))
    return {line.split()[0] for line in sizes.splitlines() if line.strip()}
def test_ancestral_nodes(target, region_specific_conserved, accelerated_genomes,
                         maf_path, region_bed, outf_handle):
    """Run the LRT test on each ancestral node below the common ancestor
    of the accelerated genomes.

    For every (ancestor name, branch model) pair yielded by rename_model,
    phyloFit is run to produce a subtree-scaled model and phastOdds is run
    against it. When the last field of the phastOdds output is a positive
    integer, the fields are written tab-separated to outf_handle with the
    second-to-last field replaced by the ancestor name.
    """
    phylofit_template = 'phyloFit --init-model {} --scale-subtree {}:loss --out-root {} {}'
    phastodds_template = 'phastOdds --output-bed --features {} --background-mods {} --feature-mods {} {}'
    for anc_name, branch_model in rename_model(target, region_specific_conserved,
                                               accelerated_genomes):
        assert len(anc_name) > 0, (anc_name, branch_model,
                                   region_specific_conserved, accelerated_genomes)
        out_root = os.path.join(target.getGlobalTempDir(),
                                'region_specific_accelerated')
        system(phylofit_template.format(branch_model, anc_name, out_root, maf_path))
        # The model is then read from the out-root path plus '.mod'.
        accelerated_model = out_root + '.mod'
        odds_output = popenCatch(phastodds_template.format(
            region_bed, branch_model, accelerated_model, maf_path))
        fields = odds_output.split()
        # discard the result if the test is not positive
        if int(fields[-1]) <= 0:
            continue
        fields[-2] = anc_name
        outf_handle.write('\t'.join(fields) + '\n')
def sanityCheckSequence(self, path):
    """Warns on stderr when a sequence path looks mostly masked or
    mostly Ns.

    path may be a file, or a directory whose files are concatenated and
    piped to cactus_analyseAssembly.
    """
    # Relies on cactus_analyseAssembly output staying in the
    # format it's currently in.
    analyser = "cactus_analyseAssembly"
    if os.path.isdir(path):
        cmdline = "cat %s/* | %s -" % (path, analyser)
    else:
        cmdline = analyser + " %s" % path
    report = popenCatch(cmdline)

    def reportedFraction(pattern):
        # We don't do error-checking here, all we'll get is a prettier
        # error message and it will be pretty obvious what's going on
        # (i.e. the analyseAssembly output will have changed)
        return float(re.search(pattern, report).group(1))

    repeatMaskedFrac = reportedFraction(r'Proportion-repeat-masked: ([0-9.]*)')
    nFrac = reportedFraction(r'ProportionNs: ([0-9.]*)')
    warn = sys.stderr.write
    # These thresholds are pretty arbitrary, but should be good for
    # badly- to well-assembled vertebrate genomes.
    if repeatMaskedFrac > 0.70:
        warn("WARNING: sequence path %s has an extremely high "
             "proportion of masked bases: %f. progressiveCactus"
             " expects a soft-masked genome, i.e. all lowercase"
             " characters are considered masked. The process "
             "will proceed normally, but make sure you haven't "
             "accidentally provided an all-lowercase genome, "
             "in which case nothing will be aligned to "
             "it!\n\n" % (path, repeatMaskedFrac))
    if nFrac > 0.30:
        warn("WARNING: sequence path %s has an extremely high "
             "proportion of 'N' bases: %f. The process will "
             "proceed normally, but make sure your genome "
             "isn't hard-masked! Alignments to hard-masked "
             "genomes are much worse than to soft-masked "
             "genomes. If the genome just has a lot of "
             "poorly assembled regions, feel free to "
             "ignore this message.\n\n" % (path, nFrac))
def getChunks(self, sequenceFiles, chunksDir):
    """Runs cactus_blast_chunkSequences on sequenceFiles, using the chunk
    and overlap sizes from self.blastOptions and chunksDir as its
    directory argument, and returns the non-empty lines it prints.
    """
    command = "cactus_blast_chunkSequences %s %i %i %s %s" % (
        getLogLevelString(),
        self.blastOptions.chunkSize,
        self.blastOptions.overlapSize,
        chunksDir,
        " ".join(sequenceFiles))
    chunkLines = popenCatch(command).split("\n")
    return [chunkLine for chunkLine in chunkLines if chunkLine != ""]
def run(self):
    """Chunks the input sequence, schedules one PreprocessChunk child per
    chunk and a MergeChunks follow-on that combines the output chunks.
    """
    logger.info("Preparing sequence for preprocessing")
    # chunk it up
    inChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksIn"))
    chunkCommand = "cactus_blast_chunkSequences %s %i 0 %s %s" % (
        getLogLevelString(), self.prepOptions.chunkSize,
        inChunkDirectory, self.inSequencePath)
    inChunkList = [chunk for chunk in popenCatch(chunkCommand).split("\n") if chunk != ""]
    outChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksOut"))
    outChunkList = []
    totalChunks = len(inChunkList)
    # For each input chunk we create an output chunk, it is the output
    # chunks that get concatenated together.
    for i, inChunk in enumerate(inChunkList):
        outChunk = os.path.join(outChunkDirectory, "chunk_%i" % i)
        outChunkList.append(outChunk)
        # Calculate the number of chunks to use
        inChunkNumber = int(max(1, math.ceil(totalChunks * self.prepOptions.proportionToSample)))
        assert inChunkNumber <= totalChunks and inChunkNumber > 0
        # Now get the list of chunks flanking and including the current chunk
        windowStart = max(0, i - inChunkNumber // 2)
        inChunks = inChunkList[windowStart:windowStart + inChunkNumber]
        missing = inChunkNumber - len(inChunks)
        if missing > 0:
            # This logic is like making the list circular
            inChunks += inChunkList[:missing]
        assert len(inChunks) == inChunkNumber
        self.addChildTarget(PreprocessChunk(self.prepOptions, inChunks,
                                            float(inChunkNumber) / totalChunks,
                                            inChunk, outChunk))
    # follow on to merge chunks
    self.setFollowOnTarget(MergeChunks(self.prepOptions, outChunkList, self.outSequencePath))
def run_augustus(target, hint_f, seq_f, name, start, stop, aln_start, aln_stop,
                 cfg_version, cfg_path, out_file_tree):
    """
    Runs Augustus for one cfg/hints pair and writes the surviving
    transcripts to a temp file obtained from out_file_tree.

    Nothing is written when no transcript overlaps [start, stop].
    """
    cmd = augustus_cmd.format(fasta=seq_f, start=start, stop=stop,
                              cfg=cfg_path, hints=hint_f)
    output_lines = popenCatch(cmd).split("\n")
    # extract only the transcript lines
    transcript_rows = [line.split() for line in output_lines
                       if "\ttranscript\t" in line]
    # keep transcripts that overlap the alignment range; the last field
    # of a transcript line is used as its name
    transcripts = [row[-1] for row in transcript_rows
                   if int(row[4]) >= start and int(row[3]) <= stop]
    # if we lose everything, stop here
    if not transcripts:
        return
    # rename transcript based on cfg version, and make names unique
    name_map = rename_transcripts(transcripts, cfg_version, name)
    # write this to a shared location where we will combine later
    out_path = out_file_tree.getTempFile()
    write_augustus(output_lines, name_map, out_path, start)
def __getSeqInfo(self, faPaths, event):
    """Runs cactus_analyseAssembly on faPaths and packs the reported
    statistics into a self.SeqInfo.

    Raises RuntimeError if any path is not a file. When event is in
    self.candidateSet, total length and N50 are multiplied by
    self.candidateBoost before the unmasked values are derived.
    """
    cmdLine = "cactus_analyseAssembly"
    for faPath in faPaths:
        if not os.path.isfile(faPath):
            raise RuntimeError("Unable to open sequence file %s" % faPath)
        cmdLine += " %s" % faPath
    isCandidate = self.candidateSet is not None and event in self.candidateSet
    analyseOutput = popenCatch(cmdLine).split()

    def valueAfter(label, convert):
        # The statistic is the token right after its label.
        labelIdx = analyseOutput.index(label)
        assert labelIdx >= 0 and labelIdx < len(analyseOutput) - 1
        return convert(analyseOutput[labelIdx + 1])

    numSequences = valueAfter("Total-sequences:", int)
    totalLength = valueAfter("Total-length:", int)
    nsPct = valueAfter("ProportionNs:", float)
    rmPct = valueAfter("Proportion-repeat-masked:", float)
    assert rmPct <= 1. and rmPct >= 0.
    n50 = valueAfter("N50:", int)
    if isCandidate:
        totalLength *= self.candidateBoost
        n50 *= self.candidateBoost
    nonNFraction = 1. - nsPct
    umLength = max(0, totalLength * nonNFraction)
    umN50 = max(0, n50 * nonNFraction)
    return self.SeqInfo(numSequences, totalLength, umLength, n50, umN50)
def _run_evolver_decomposed_no_outgroup(self, binariesMode):
    """ Run just the mouse-rat alignment, step by step, from the plan
    printed by cactus-prepare. Inspired by issues arising here
    https://github.com/ComparativeGenomicsToolkit/cactus/pull/216
    https://github.com/ComparativeGenomicsToolkit/cactus/pull/217
    """
    out_dir = os.path.join(self.tempDir, 'output')
    out_seqfile = os.path.join(out_dir, 'evolverMammalsOut.txt')
    in_seqfile = os.path.join(self.tempDir, 'evolverMammalsIn.txt')
    seqfile_lines = [
        '(simMouse_chr6:0.084509,simRat_chr6:0.091589);\n',
        'simMouse_chr6 http://s3-us-west-2.amazonaws.com/jcarmstr-misc/testRegions/evolverMammals/simMouse.chr6\n',
        'simRat_chr6 http://s3-us-west-2.amazonaws.com/jcarmstr-misc/testRegions/evolverMammals/simRat.chr6\n',
    ]
    with open(in_seqfile, 'w') as inseq:
        for seqfile_line in seqfile_lines:
            inseq.write(seqfile_line)

    prepare_args = ['cactus-prepare', in_seqfile,
                    '--outDir', out_dir,
                    '--outSeqFile', out_seqfile,
                    '--outHal', self._out_hal(binariesMode),
                    '--jobStore', self._job_store(binariesMode)]
    job_plan = popenCatch(' '.join(prepare_args))

    for plan_line in job_plan.split('\n'):
        step = plan_line.strip()
        # Blank lines and '#' comment lines of the plan are not commands.
        if len(step) == 0 or step.startswith('#'):
            continue
        # todo interface in prepare
        if step.startswith('cactus-'):
            step += ' --binariesMode {}'.format(binariesMode)
            if binariesMode == 'docker':
                step += ' --latest'
        if step.startswith('cactus-align'):
            # Remove all the id prefixes to pretend the cigars did not
            # come from cactus-blast
            subprocess.check_call('sed -i -e \'s/id=[0,1]|//g\' {}/Anc0.cigar*'.format(out_dir), shell=True)
            step += ' --nonBlastInput'
        sys.stderr.write('Running {}'.format(step))
        subprocess.check_call(step, shell=True)
def runCactusFlowerStats(cactusDiskDatabaseString, flowerName, logLevel=None):
    """Runs cactus_workflow_flowerStats for the given flower and returns
    the first line of its output.
    """
    logLevel = getLogLevelString2(logLevel)
    command = "cactus_workflow_flowerStats %s '%s' %s" % (
        logLevel, cactusDiskDatabaseString, flowerName)
    firstLine, _, _ = popenCatch(command).partition("\n")
    return firstLine
def _run_evolver_decomposed(self, name):
    """ Run the full evolver test, putting the jobstore and output in
    tempDir, but instead of doing it in one shot, use cactus-prepare to
    print a plan and run each command of that plan as a separate step.
    """
    out_dir = os.path.join(self.tempDir, 'output')
    out_seqfile = os.path.join(out_dir, 'evolverMammalsOut.txt')
    in_seqfile = './examples/evolverMammals.txt'
    prepare_args = ['cactus-prepare', in_seqfile,
                    '--outDir', out_dir,
                    '--outSeqFile', out_seqfile,
                    '--outHal', self._out_hal(name),
                    '--jobStore', self._job_store(name)]
    job_plan = popenCatch(' '.join(prepare_args))

    for plan_line in job_plan.split('\n'):
        step = plan_line.strip()
        # Blank lines and '#' comment lines of the plan are not commands.
        if len(step) == 0 or step.startswith('#'):
            continue
        # do Anc2 in binariesMode docker to broaden test coverage
        if 'Anc2' in step and step.startswith('cactus-'):
            step += ' --binariesMode docker --latest'
        sys.stderr.write('Running {}'.format(step))
        subprocess.check_call(step, shell=True)
def run(self):
    """Samples reference positions, schedules ScoreColumns children over
    slices of them, and sets Summarize as the follow-on.
    """
    speciesTree = popenCatch("halStats --tree %s" % (self.opts.halFile)).strip()
    chromSizes = getChromSizes(self.opts.halFile, self.opts.refGenome)
    positions = []
    # For ensuring that a column isn't counted multiple times from
    # different reference positions.
    positionSet = set()
    for _ in xrange(self.opts.numSamples):
        # Have to sample the columns here since otherwise it can
        # be difficult to independently seed several RNGs
        pos = samplePosition(chromSizes)
        if pos in positionSet:
            continue
        positions.append(pos)
        positionSet.add(pos)

    outputs = []
    samplesPerJob = self.opts.samplesPerJob
    # NOTE: repeated draws are dropped above, so positions can be shorter
    # than numSamples while this loop still steps up to numSamples; the
    # trailing slices are then empty.
    for sliceStart in xrange(0, self.opts.numSamples, samplesPerJob):
        jobPositions = positions[sliceStart:sliceStart + samplesPerJob]
        outputFile = getTempFile(rootDir=self.getGlobalTempDir())
        outputs.append(outputFile)
        self.addChildTarget(ScoreColumns(self.opts, jobPositions, outputFile,
                                         speciesTree, positionSet))
    self.setFollowOnTarget(Summarize(self.opts, outputs, self.opts.outputFile,
                                     self.opts.writeMismatchesToFile))
def testInvariants(self):
    """Checks that cactus_coverage output is sorted and free of overlaps.

    Two random non-chimp encode sequences are aligned with cPecanLastz and
    the coverage on the second is computed. Every bed interval must be at
    least one base long, and within a chromosome each interval must start
    after the previous one starts and not before the previous one ends.
    """
    (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
    # Chimp encode input has duplicate header names.
    seqs = [i for i in seqs if 'chimp' not in i]
    seqs = random.sample(seqs, 2)
    cigarPath = getTempFile()
    try:
        system("cPecanLastz --format=cigar %s[multiple] %s[multiple] > %s" % \
               (seqs[0], seqs[1], cigarPath))
        bed = popenCatch("cactus_coverage %s %s" % (seqs[1], cigarPath))
    finally:
        # The bed is held in memory, so the cigars can go right away,
        # also when one of the commands fails.
        os.remove(cigarPath)
    prevChrom = None
    prevStart = None
    prevEnd = None
    # Check that everything is sorted and there are no overlaps
    for line in bed.split("\n"):
        # strip() returns a new string; keep the result.
        line = line.strip()
        if line == "":
            continue
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        self.assertGreaterEqual(end - start, 1)
        if chrom == prevChrom:
            self.assertGreater(start, prevStart)
            self.assertGreaterEqual(start, prevEnd)
        # Remember this interval, otherwise the comparisons above never
        # have anything to compare against.
        prevChrom, prevStart, prevEnd = chrom, start, end
def testFromC(self):
    """Checks "--from" filtering of cactus_coverage on genome C."""
    # Test "--from" filtering by filtering for only alignments
    # from/to D on C.
    command = "cactus_coverage %s %s --from %s" % (
        self.simpleFastaPathC, self.simpleCigarPath, self.simpleFastaPathD)
    bed = popenCatch(command)
    expectedBed = dedent('''\
        id=3|simpleSeqC1\t0\t10\t\t1
        ''')
    self.assertEqual(bed, expectedBed)
def runCactusBar(cactusDiskDatabaseString, flowerNames, logLevel=None,
                 spanningTrees=None, maximumLength=None,
                 gapGamma=None,
                 matchGamma=None,
                 splitMatrixBiggerThanThis=None,
                 anchorMatrixBiggerThanThis=None,
                 repeatMaskMatrixBiggerThanThis=None,
                 diagonalExpansion=None,
                 constraintDiagonalTrim=None,
                 minimumBlockDegree=None,
                 minimumIngroupDegree=None,
                 minimumOutgroupDegree=None,
                 alignAmbiguityCharacters=None,
                 pruneOutStubAlignments=None,
                 useProgressiveMerging=None,
                 calculateWhichEndsToComputeSeparately=None,
                 largeEndSize=None,
                 endAlignmentsToPrecomputeOutputFile=None,
                 precomputedAlignments=None,
                 ingroupCoverageFile=None,
                 minimumSizeToRescue=None,
                 minimumCoverageToRescue=None,
                 minimumNumberOfSpecies=None):
    """Runs cactus_bar (the cactus base aligner), feeding flowerNames on
    stdin.

    Each optional argument is turned into a command-line fragment by
    nameValue(). Returns the non-empty lines of the command's output.
    """
    logLevel = getLogLevelString2(logLevel)
    # Fragments in the order they appear on the command line.
    optionStrings = [
        nameValue("spanningTrees", spanningTrees, int),
        nameValue("maximumLength", maximumLength, int),
        nameValue("gapGamma", gapGamma, float),
        nameValue("matchGamma", matchGamma, float),
        nameValue("splitMatrixBiggerThanThis", splitMatrixBiggerThanThis, int),
        nameValue("anchorMatrixBiggerThanThis", anchorMatrixBiggerThanThis, int),
        nameValue("repeatMaskMatrixBiggerThanThis", repeatMaskMatrixBiggerThanThis, int),
        nameValue("constraintDiagonalTrim", constraintDiagonalTrim, int),
        # The command-line name differs from the parameter name here.
        nameValue("minimumDegree", minimumBlockDegree, int),
        nameValue("minimumIngroupDegree", minimumIngroupDegree, int),
        nameValue("minimumOutgroupDegree", minimumOutgroupDegree, int),
        nameValue("alignAmbiguityCharacters", alignAmbiguityCharacters, bool),
        nameValue("pruneOutStubAlignments", pruneOutStubAlignments, bool),
        nameValue("diagonalExpansion", diagonalExpansion, int),
        nameValue("useProgressiveMerging", useProgressiveMerging, bool),
        nameValue("calculateWhichEndsToComputeSeparately",
                  calculateWhichEndsToComputeSeparately, bool),
        nameValue("largeEndSize", largeEndSize, int),
        nameValue("endAlignmentsToPrecomputeOutputFile",
                  endAlignmentsToPrecomputeOutputFile, str),
        nameValue("precomputedAlignments", precomputedAlignments, str, quotes=True),
        nameValue("ingroupCoverageFile", ingroupCoverageFile, str, quotes=True),
        nameValue("minimumSizeToRescue", minimumSizeToRescue, int),
        nameValue("minimumCoverageToRescue", minimumCoverageToRescue, float),
        nameValue("minimumNumberOfSpecies", minimumNumberOfSpecies, int),
    ]
    command = ("cactus_bar --cactusDisk '%s' --logLevel %s "
               % (cactusDiskDatabaseString, logLevel)) + " ".join(optionStrings)
    masterMessages = popenCatch(command, stdinString=flowerNames)
    logger.info("Ran cactus_bar okay")
    messages = []
    for message in masterMessages.split("\n"):
        if message != '':
            messages.append(message)
    return messages
def testSimpleCoverageOnB(self):
    """Checks the plain cactus_coverage output for genome B."""
    # Genome B
    command = "cactus_coverage %s %s" % (self.simpleFastaPathB,
                                         self.simpleCigarPath)
    bed = popenCatch(command)
    expectedBed = dedent('''\
        id=2|simpleSeqB1\t0\t12\t\t1
        id=2|simpleSeqB1\t17\t19\t\t1
        id=2|simpleSeqB1\t21\t32\t\t1
        ''')
    self.assertEqual(bed, expectedBed)
def percentCoverage(sequenceFile, coverageFile):
    """Get the % coverage of a sequence from a coverage file.

    Returns 0 when the sequence length is 0 or the coverage file yields no
    total; otherwise 100 * (sum of column 3 - column 2) / sequence length.
    """
    sequenceLen = sequenceLength(sequenceFile)
    if sequenceLen == 0:
        return 0
    awkCommand = "awk '{ total += $3 - $2 } END { print total }' %s" % coverageFile
    coveredBases = popenCatch(awkCommand)
    if coveredBases.strip() != '':
        return 100*float(coveredBases)/sequenceLen
    # No coverage lines
    return 0
def get_masked_bases(ftp_url):
    """Sum (column 7 - column 6) over the "<name>_rm.out.gz" file found
    under ftp_url, where <name> is the last path component of ftp_url.

    The file is fetched with curl and decompressed, and its first three
    lines are dropped before summing. Returns 0 for an empty ftp_url,
    otherwise the total as an int.
    """
    if ftp_url == '':
        return 0
    last_component = ftp_url.rsplit('/', 1)[-1]
    rm_out_url = '/'.join([ftp_url, last_component + '_rm.out.gz'])
    pipeline = ("curl -s %s | gzip -d | sed '1,3d' | awk '{total += $7 - $6} END { print total }'"
                % rm_out_url)
    return int(popenCatch(pipeline))
def splitBed(bed, numParts):
    """Split up a bed file by lines into N parts, return the paths of the
    split files.

    bed      -- path of the bed file to split.
    numParts -- number of parts wanted.

    Returns the paths matching "<bed>.temp.<suffix>*", as produced by the
    split command (an empty list for an empty bed file).
    """
    # Count the lines here rather than with "wc -l | cut -d' ' -f 1":
    # that pipeline yields an empty first field when wc left-pads its
    # count, and wc does not count an unterminated last line, which split
    # still treats as a line.
    with open(bed) as bedFile:
        numLines = sum(1 for _ in bedFile)
    # Random suffix so two runs on the same file don't collide
    suffix = "".join(
        [random.choice(string.ascii_uppercase) for _ in range(7)])
    # split rejects a line count of 0, so ask for at least one line per part.
    linesPerPart = max(1, int(math.ceil(float(numLines) / numParts)))
    system("split -l %d %s %s.temp.%s" % (linesPerPart, bed, bed, suffix))
    return glob('%s.temp.%s*' % (bed, suffix))
def main():
    """Parse the command line and lift mutations onto the reference for
    each target genome.

    For every target, the path from the reference genome to the target is
    split at their most recent common ancestor. Mutations for genomes on
    the way up are lifted with reversed polarity, those on the way down
    with normal polarity, all into "<outputDir>/<target>.bed".

    Raises ValueError if the reference or any target is not a leaf genome.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('hal', help='hal file')
    parser.add_argument('refGenome', help='reference genome')
    parser.add_argument('halTreeMutationsDir',
                        help='the directory output by halTreeMutations.py')
    parser.add_argument(
        '--targets',
        help='target genomes (comma-separated), default: all leaves')
    parser.add_argument('outputDir',
                        help='output directory for reference beds')
    opts = parser.parse_args()

    # Get the species tree from the hal file.
    newickTree = popenCatch('halStats --tree %s' % (opts.hal))
    tree = NXNewick().parseString(newickTree)

    # Set the target genomes to be all leaves (minus the reference) if
    # not otherwise directed.
    leafGenomes = [tree.getName(x) for x in tree.getLeaves()]
    if opts.refGenome not in leafGenomes:
        raise ValueError("Reference genome %s is not a leaf genome." % opts.refGenome)
    if opts.targets is None:
        opts.targets = [x for x in leafGenomes if x != opts.refGenome]
    else:
        opts.targets = opts.targets.split(',')
        if not all(x in leafGenomes for x in opts.targets):
            raise ValueError("Some target genomes are not leaves.")

    try:
        os.makedirs(opts.outputDir)
    except OSError:
        # An already-existing directory is fine; anything else is not.
        if not os.path.isdir(opts.outputDir):
            raise

    # The reference does not change between targets.
    refID = getTreeID(tree, opts.refGenome)
    for target in opts.targets:
        targetID = getTreeID(tree, target)
        mrca = getMRCA(tree, refID, targetID)
        mrcaName = tree.getName(mrca)
        pathToTarget = getPath(opts.hal, opts.refGenome, target)
        # Drop the MRCA itself; what is left is the run of genomes before
        # it and the run after it.
        pathUp, pathDown = [
            list(v)
            for k, v in groupby(pathToTarget, lambda x: x == mrcaName)
            if not k
        ]
        bedForTarget = os.path.join(opts.outputDir, target + '.bed')
        # First, walk up the tree to the MRCA.
        for curGenome in pathUp:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget, reversePolarity=True)
        # Next, walk down the tree to the target.
        for curGenome in pathDown:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget)
def runReferenceMedianProblemTest(medianHistory, greedyIterations, theta):
    """Runs the reference problem for a given median history.

    The transitive adjacencies of all leaf genomes are weighted with
    weightFn(distance, theta) and summed per (node1, node2) pair. The
    problem is written to a temp file and fed on stdin to the
    "referenceMedianProblemTest2" binary in the "testBin" directory next
    to the simulatedGenome module. The temp file is removed even if the
    binary fails.

    Returns a Genome with a single Chromosome built from the output, after
    asserting that its elements equal those of the history's median genome.
    """
    # Make adjacencies
    stubNumber = 2
    nodeNumber = len(
        medianHistory.getMedianGenome().getElements()) * 2 + stubNumber
    weights = {}
    for genome in medianHistory.getLeafGenomes():
        for node1, node2, distance in genome.getTransitiveAdjacencies():
            if (node1, node2) in weights:
                weights[(node1, node2)] += weightFn(distance, theta)
            else:
                weights[(node1, node2)] = weightFn(distance, theta)

    def translateLeftSideOfElementToNode(element):
        # Negative elements map to even nodes, positive ones to odd nodes.
        assert element != 0
        if element < 0:
            return abs(element) * 2
        return element * 2 + 1

    def translateLeftNodeToElement(node):
        # Inverse of the mapping above; even nodes give negative elements.
        assert node >= stubNumber
        assert node < nodeNumber
        element = node // 2
        if (node % 2) == 0:
            element *= -1
        return element

    # Now print out the problem
    adjacencyStrings = [
        "%i\t%i\t%f" % (translateLeftSideOfElementToNode(-node1),
                        translateLeftSideOfElementToNode(node2),
                        weight)
        for (node1, node2), weight in weights.items()
    ]
    problemString = "%i\t%i\t%i\t%i\t%s" % (
        greedyIterations, nodeNumber, stubNumber, len(weights),
        "\t".join(adjacencyStrings))
    tempPath = getTempFile()
    try:
        with open(tempPath, 'w') as tempFile:
            tempFile.write(problemString)
        # Command
        command = os.path.join(
            os.path.split(
                os.path.abspath(
                    matchingAndOrdering.tests.simulatedGenome.__file__))[0],
            "testBin", "referenceMedianProblemTest2")
        output = popenCatch(command + " < %s" % tempPath)
    finally:
        os.remove(tempPath)
    medianChromosome = Chromosome()
    for adjacency in output.split():
        medianChromosome.append(translateLeftNodeToElement(int(adjacency)))
    medianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    medianGenome.addChromosome(medianChromosome)
    assert medianGenome.getElements() == medianHistory.getMedianGenome(
    ).getElements()
    return medianGenome
def getChromSizes(halPath, genome):
    """Get a dictionary of (chrom name):(chrom size) from a hal file.

    Output lines of "halStats --chromSizes" that do not consist of exactly
    two tab-separated fields are ignored.
    """
    output = popenCatch("halStats --chromSizes %s %s" % (genome, halPath))
    rows = (line.split("\t") for line in output.split("\n"))
    return {row[0]: int(row[1]) for row in rows if len(row) == 2}
def testDepthByIDOnB(self):
    """Checks the cactus_coverage --depthById output for genome B."""
    # Genome B using depthByID: should be the same as normal
    # except for 30-31, where it should be 2
    # NOTE(review): the expected output below lists depth 1 for the whole
    # 21-32 interval, which does not match the comment above -- confirm
    # which one is intended.
    command = "cactus_coverage --depthById %s %s" % (
        self.simpleFastaPathB, self.simpleCigarPath)
    bed = popenCatch(command)
    expectedBed = dedent('''\
        id=2|simpleSeqB1\t0\t12\t\t1
        id=2|simpleSeqB1\t17\t19\t\t1
        id=2|simpleSeqB1\t21\t32\t\t1
        ''')
    self.assertEqual(bed, expectedBed)
def run(self):
    """Breaks the flower sequences into chunk files and schedules the
    all-against-all blast over them.
    """
    chunksDir = makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks"))
    command = "cactus_blast_chunkFlowerSequences %s '%s' %s %i %i %i %s" % (
        getLogLevelString(),
        self.cactusDisk,
        self.flowerName,
        self.blastOptions.chunkSize,
        self.blastOptions.overlapSize,
        self.blastOptions.minimumSequenceLength,
        chunksDir)
    # One chunk per non-empty line of the command's output.
    chunks = [chunkLine for chunkLine in popenCatch(command).split("\n")
              if chunkLine != ""]
    logger.info("Broken up the flowers into individual 'chunk' files")
    self.addChildTarget(MakeBlastsAllAgainstAll(self.blastOptions, chunks,
                                                self.finalResultsFile))
def testProgressiveOutgroupsVsAllOutgroups(self):
    """Compare ingroup coverage between "set against set" mode and
    "ingroups vs. outgroups" mode of cactus blast.
    """
    encodeRegion = "ENm001"
    ingroup = "human"
    outgroups = ["macaque", "rabbit", "dog"]
    regionPath = os.path.join(self.encodePath, encodeRegion)

    def fastaPath(species):
        # Path of the fasta for one species within the encode region.
        return os.path.join(regionPath, species + "." + encodeRegion + ".fa")

    def totalCoverage(cigarFile):
        # Covered bases on the ingroup according to cactus_coverage.
        return int(popenCatch(
            "cactus_coverage %s %s | awk '{ total += $3 - $2} END { print total }'"
            % (ingroupPath, cigarFile)))

    def lastOutgroupCoverage(cigarFile):
        # Covered bases on the ingroup from cigar lines that mention the
        # last outgroup only.
        return int(popenCatch(
            "grep %s %s | cactus_coverage %s /dev/stdin | awk '{ total += $3 - $2} END { print total }'"
            % (outgroups[-1], cigarFile, ingroupPath)))

    ingroupPath = fastaPath(ingroup)
    outgroupPaths = [fastaPath(outgroup) for outgroup in outgroups]

    # "Set against set" mode: the entire ingroup vs each outgroup.
    runCactusBlast([ingroupPath], self.tempOutputFile,
                   os.path.join(self.tempDir, "setVsSetJobTree"),
                   chunkSize=500000, overlapSize=10000,
                   targetSequenceFiles=outgroupPaths)

    # "Ingroup vs outgroups" mode: the outgroups are used in order,
    # trimming away sequence that has already been aligned.
    system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/outgroupJobTree"
           % (ingroupPath, ",".join(outgroupPaths), self.tempOutputFile2,
              self.tempDir))

    # Coverage on the ingroup, in bases, from each run.
    coverageSetVsSet = totalCoverage(self.tempOutputFile)
    coverageIngroupVsOutgroups = totalCoverage(self.tempOutputFile2)
    print("total coverage on human (set vs set mode, %d outgroups): %d"
          % (len(outgroups), coverageSetVsSet))
    print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d"
          % (len(outgroups), coverageIngroupVsOutgroups))

    # The trimming strategy must keep a reasonable fraction of the
    # alignments.
    keptFraction = float(coverageIngroupVsOutgroups) / coverageSetVsSet
    self.assertTrue(keptFraction >= 0.95)

    # Coverage from just the last outgroup should be much higher in set
    # vs set mode than in ingroup vs outgroup mode.
    coverageFromLastOutgroupSetVsSet = lastOutgroupCoverage(self.tempOutputFile)
    coverageFromLastOutgroupInVsOut = lastOutgroupCoverage(self.tempOutputFile2)
    print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d"
          % (outgroups[-1], coverageFromLastOutgroupSetVsSet))
    print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d"
          % (outgroups[-1], coverageFromLastOutgroupInVsOut))
    lastOutgroupFraction = (float(coverageFromLastOutgroupInVsOut)
                            / coverageFromLastOutgroupSetVsSet)
    self.assertTrue(lastOutgroupFraction <= 0.10)
def testFlanking(self):
    """Run cactus_trimSequences.py with one base of flanking sequence and
    check the fasta records it emits.
    """
    command = ("cactus_trimSequences.py --flanking 1 --minSize 0 --windowSize 1 --threshold 1 %s %s"
               % (self.faPath, self.bedPath))
    fa = popenCatch(command)
    # The two blocks 0-5, 6-11 should be merged together since
    # their flanking sequence intersects. Additionally the
    # flanking sequence shouldn't go past the beginning sequence.
    mergedRecord = dedent('''\
    >seq1|0
    CATGCATGCATG''')
    trailingRecord = dedent('''\
    >seq1|14
    TGC''')
    for expectedRecord in (mergedRecord, trailingRecord):
        self.assertTrue(expectedRecord in fa)
def testAddingOutgroupsImprovesResult(self):
    """Run blast on "ingroup" and "outgroup" encode regions with a growing
    number of outgroups, and check that adding outgroups (very nearly)
    only adds alignments rather than losing any.
    """
    def humanPositions(alignments):
        # (name, position) of the human side of every alignment involving
        # a human sequence.
        return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                   for x in alignments
                   if "human" in x[0] or "human" in x[2])

    encodeRegions = ["ENm00" + str(i) for i in xrange(1, 2)]
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    # subselect 4 random ordered outgroups
    chosen = sorted(random.sample(xrange(len(outgroups)), 4))
    outgroups = [outgroups[i] for i in chosen]
    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                        for name in ingroups]
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]
        results = []
        for numOutgroups in xrange(1, 5):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            print("aligning %s vs %s" % (",".join(ingroupPaths),
                                         ",".join(subOutgroupPaths)))
            tmpToil = os.path.join(self.tempDir, "outgroupToil")
            runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                               alignmentsFile=subResults,
                                               toilDir=tmpToil)
            results.append(subResults)

        # Print diagnostics about coverage
        for index, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                ingroupCoverage = getTempFile(rootDir=self.tempDir)
                coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                calculateCoverage(work_dir=coverageWorkDir,
                                  sequenceFile=ingroupPath,
                                  cigarFile=subResults,
                                  outputFile=ingroupCoverage)
                coveredBases = popenCatch(
                    "cat %s | awk '{ total += $3 - $2 } END { print total }'"
                    % ingroupCoverage)
                print("covered bases on %s using %d outgroups: %s"
                      % (ingroup, index + 1, coveredBases))

        resultsSets = [loadResults(path) for path in results]
        for index, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Make sure the results from (n+1) outgroups are
            # (very nearly) a superset of the results from n outgroups
            print("Using %d addl outgroup(s):" % (index + 1))
            comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Report how much of the already existing alignments to human
        # the new alignments cover again.
        for index in xrange(1, len(resultsSets)):
            prevResults = resultsSets[index - 1][0]
            curResults = resultsSets[index][0]
            prevResultsHumanPos = humanPositions(prevResults)
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = humanPositions(newAlignments)
            recovered = newAlignmentsHumanPos.intersection(prevResultsHumanPos)
            print("addl outgroup %d:" % index)
            print("bases re-covered: %f (%d)"
                  % (len(recovered) / float(len(prevResultsHumanPos)),
                     len(recovered)))
        for subResult in results:
            os.remove(subResult)
def liftoverLine(halFile, refGenome, refBedLine, targetGenome, targetSeq=None):
    """Lift a bed line over from refGenome to targetGenome.

    The bed line is fed to halLiftover on stdin and the non-empty PSL lines
    it prints are returned as a list. When targetSeq is given, only lines
    whose 14th whitespace-separated field equals targetSeq are kept.
    """
    command = "halLiftover --outPSL %s %s stdin %s stdout" % (
        halFile, refGenome, targetGenome)
    liftoverOutput = popenCatch(command, stdinString=refBedLine)
    pslLines = [line for line in liftoverOutput.split("\n") if line != ""]
    if targetSeq is None:
        return pslLines
    return [line for line in pslLines if line.split()[13] == targetSeq]
def runCactusSetup(cactusDiskDatabaseString, sequences, newickTreeString,
                   logLevel=None, outgroupEvents=None,
                   makeEventHeadersAlphaNumeric=None):
    """Run cactus_setup and return its non-empty output lines.

    :param cactusDiskDatabaseString: passed (single-quoted) as --cactusDisk.
    :param sequences: iterable of sequence arguments, joined with spaces.
    :param newickTreeString: passed (single-quoted) as --speciesTree.
    :param logLevel: converted with getLogLevelString2.
    :param outgroupEvents: optional; rendered with nameValue.
    :param makeEventHeadersAlphaNumeric: optional; rendered with nameValue.
    :return: list of the non-empty lines printed by cactus_setup.
    """
    logLevel = getLogLevelString2(logLevel)
    outgroupEvents = nameValue("outgroupEvents", outgroupEvents, str,
                               quotes=True)
    makeEventHeadersAlphaNumeric = nameValue("makeEventHeadersAlphaNumeric",
                                             makeEventHeadersAlphaNumeric,
                                             bool)
    # Adjacent literals instead of a backslash continuation inside the
    # string, so that source indentation does not end up in the command.
    command = ("cactus_setup %s --speciesTree '%s' --cactusDisk '%s' "
               "--logLevel %s %s %s") % (
        " ".join(sequences), newickTreeString, cactusDiskDatabaseString,
        logLevel, outgroupEvents, makeEventHeadersAlphaNumeric)
    masterMessages = popenCatch(command)
    logger.info("Ran cactus setup okay")
    return [message for message in masterMessages.split("\n")
            if message != '']
def main(argv=None):
    """Run a command line and print its wall-clock time and resource usage.

    :param argv: argument vector; defaults to sys.argv. Must hold exactly
        the program name and one command-line string, otherwise a usage
        message is printed and the process exits with status 1.
    :return: 0 on success.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("usage: runAndGetResources.py 'cmdline'")
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive sessions and is missing when site is not loaded.
        sys.exit(1)
    cmdline = argv[1]
    wallStart = time.time()
    # Only the timing matters here; the command's output is discarded.
    popenCatch(cmdline)
    wallClock = time.time() - wallStart
    # Prints one tuple: the wall-clock time followed by whatever
    # getTotalCpuTimeAndMemoryUsage() returns.
    print((wallClock,) + getTotalCpuTimeAndMemoryUsage())
    return 0
def main(argv=None):
    """Time a shell command and report the resources it used.

    :param argv: argument vector; sys.argv is used when it is None. Anything
        other than the program name plus one command-line string prints a
        usage message and exits with status 1.
    :return: 0 on success.
    """
    args = sys.argv if argv is None else argv
    if len(args) != 2:
        print("usage: runAndGetResources.py 'cmdline'")
        # The bare exit() helper comes from the site module and is intended
        # for interactive use; sys.exit is always available.
        sys.exit(1)
    started = time.time()
    # The command's output is not needed, only how long it took.
    popenCatch(args[1])
    elapsed = time.time() - started
    # Printed as a single tuple: elapsed wall-clock time followed by the
    # values returned by getTotalCpuTimeAndMemoryUsage().
    print((elapsed,) + getTotalCpuTimeAndMemoryUsage())
    return 0
def testComplement(self):
    """Run cactus_trimSequences.py with --complement and check the fasta
    records it emits.
    """
    command = ("cactus_trimSequences.py --flanking 0 --minSize 0 --windowSize 1 --threshold 1 --complement %s %s"
               % (self.faPath, self.bedPath))
    fa = popenCatch(command)
    expectedFragments = [
        dedent('''\
        >seq1|5
        A'''),
        dedent('''\
        >seq1|11'''),
        dedent('''\
        >seq1|16'''),
        # make sure the sequence that isn't covered at all is included
        dedent('''\
        >seq2|0'''),
    ]
    for fragment in expectedFragments:
        self.assertTrue(fragment in fa)
def testAddingOutgroupsImprovesResult(self):
    """Run blast on "ingroup" and "outgroup" encode regions with a growing
    number of outgroups, and check that adding outgroups (very nearly)
    only adds alignments rather than losing any.
    """
    def humanPositions(alignments):
        # (name, position) of the human side of every alignment involving
        # a human sequence.
        return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                   for x in alignments
                   if "human" in x[0] or "human" in x[2])

    encodeRegion = "ENm001"
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    MAX_NUM_OUTGROUPS = 3
    # subselect a random set of outgroups in the same order
    chosen = sorted(random.sample(xrange(len(outgroups)), MAX_NUM_OUTGROUPS))
    outgroups = [outgroups[i] for i in chosen]
    regionPath = os.path.join(self.encodePath, encodeRegion)
    ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                    for name in ingroups]
    outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                     for name in outgroups]
    results = []
    for numOutgroups in xrange(1, len(outgroups) + 1):
        # Align w/ increasing numbers of outgroups
        subResults = getTempFile()
        subOutgroupPaths = outgroupPaths[:numOutgroups]
        print("aligning %s vs %s" % (",".join(ingroupPaths),
                                     ",".join(subOutgroupPaths)))
        tmpToil = os.path.join(self.tempDir, "outgroupToil")
        runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths,
                                           alignmentsFile=subResults,
                                           toilDir=tmpToil)
        results.append(subResults)

    # Print diagnostics about coverage
    for index, subResults in enumerate(results):
        for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
            ingroupCoverage = getTempFile(rootDir=self.tempDir)
            calculateCoverage(sequenceFile=ingroupPath, cigarFile=subResults,
                              outputFile=ingroupCoverage)
            coveredBases = popenCatch(
                "cat %s | awk '{ total += $3 - $2 } END { print total }'"
                % ingroupCoverage)
            print("covered bases on %s using %d outgroups: %s"
                  % (ingroup, index + 1, coveredBases))

    resultsSets = [loadResults(path) for path in results]
    for index, moreOutgroupsResults in enumerate(resultsSets[1:]):
        # Make sure the results from (n+1) outgroups are
        # (very nearly) a superset of the results from n outgroups
        print("Using %d addl outgroup(s):" % (index + 1))
        comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
        print(comparator)
        self.assertTrue(comparator.sensitivity >= 0.99)

    # Report how much of the already existing alignments to human the new
    # alignments cover again.
    for index in xrange(1, len(resultsSets)):
        prevResults = resultsSets[index - 1][0]
        curResults = resultsSets[index][0]
        prevResultsHumanPos = humanPositions(prevResults)
        newAlignments = curResults.difference(prevResults)
        newAlignmentsHumanPos = humanPositions(newAlignments)
        recovered = newAlignmentsHumanPos.intersection(prevResultsHumanPos)
        print("addl outgroup %d:" % index)
        print("bases re-covered: %f (%d)"
              % (len(recovered) / float(len(prevResultsHumanPos)),
                 len(recovered)))
    for subResult in results:
        os.remove(subResult)
def testAddingOutgroupsImprovesResult(self):
    """Run cactus_blast.py on "ingroup" and "outgroup" encode regions with
    a growing number of outgroups, and check that adding outgroups (very
    nearly) only adds alignments rather than losing any.
    """
    def humanPositions(alignments):
        # (name, position) of the human side of every alignment involving
        # a human sequence.
        return set((x[0], x[1]) if "human" in x[0] else (x[2], x[3])
                   for x in alignments
                   if "human" in x[0] or "human" in x[2])

    encodeRegions = ["ENm00" + str(i) for i in xrange(1, 2)]
    ingroups = ["human", "macaque"]
    outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
    # subselect 4 random ordered outgroups
    chosen = sorted(random.sample(xrange(len(outgroups)), 4))
    outgroups = [outgroups[i] for i in chosen]
    for encodeRegion in encodeRegions:
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                        for name in ingroups]
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa")
                         for name in outgroups]
        results = []
        for numOutgroups in xrange(1, 5):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            tmpJobTree = getTempDirectory()
            print("aligning %s vs %s" % (",".join(ingroupPaths),
                                         ",".join(subOutgroupPaths)))
            system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/jobTree"
                   % (",".join(ingroupPaths), ",".join(subOutgroupPaths),
                      subResults, tmpJobTree))
            system("rm -fr %s" % (tmpJobTree))
            results.append(subResults)

        # Print diagnostics about coverage
        for index, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                coveredBases = popenCatch(
                    "cactus_coverage %s %s | awk '{ total += $3 - $2 } END { print total }'"
                    % (ingroupPath, subResults))
                print("covered bases on %s using %d outgroups: %s"
                      % (ingroup, index + 1, coveredBases))

        resultsSets = [loadResults(path) for path in results]
        for index, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Make sure the results from (n+1) outgroups are
            # (very nearly) a superset of the results from n outgroups
            print("Using %d addl outgroup(s):" % (index + 1))
            comparator = ResultComparator(resultsSets[0], moreOutgroupsResults)
            print(comparator)
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Report how much of the already existing alignments to human
        # the new alignments cover again.
        for index in xrange(1, len(resultsSets)):
            prevResults = resultsSets[index - 1][0]
            curResults = resultsSets[index][0]
            prevResultsHumanPos = humanPositions(prevResults)
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos = humanPositions(newAlignments)
            recovered = newAlignmentsHumanPos.intersection(prevResultsHumanPos)
            print("addl outgroup %d:" % index)
            print("bases re-covered: %f (%d)"
                  % (len(recovered) / float(len(prevResultsHumanPos)),
                     len(recovered)))
        for subResult in results:
            os.remove(subResult)
def testSimplestParameters(self):
    """Run cactus_trimSequences.py with no flanking, no minimum size and a
    window of 1, and check the fasta records it emits.
    """
    # Test w/ no windowing, minimum size, etc to see if bed
    # import/fasta export works
    command = ("cactus_trimSequences.py --flanking 0 --minSize 0 --windowSize 1 --threshold 1 %s %s"
               % (self.faPath, self.bedPath))
    fa = popenCatch(command)
    expectedRecords = [
        dedent('''\
        >seq1|0
        CATGC'''),
        dedent('''\
        >seq1|6
        TGCAT'''),
        dedent('''\
        >seq1|15
        G'''),
    ]
    for record in expectedRecords:
        self.assertTrue(record in fa)
def getBedLineForSequence(halFile, genome, sequence):
    """Return the tab-joined bed line for one sequence of a genome.

    The line is taken from the output of "halStats --bedSequences".
    Raises RuntimeError when the genome has no sequence, or more than one
    sequence, with the given name.
    """
    bedOutput = popenCatch(
        "halStats --bedSequences %s %s" % (genome, halFile))
    matches = []
    for line in bedOutput.split("\n"):
        if line == "":
            continue
        fields = line.split()
        if fields[0] == sequence:
            matches.append(fields)
    if len(matches) == 0:
        raise RuntimeError("No sequence named %s found in genome %s" % (sequence, genome))
    if len(matches) > 1:
        raise RuntimeError("More than one sequence named %s in genome %s, "
                           "aborting!" % (sequence, genome))
    return "\t".join(matches[0])
def liftoverLine(halFile, refGenome, refBedLine, targetGenome, targetSeq=None):
    """Return the PSL lines halLiftover produces for a bed line.

    refBedLine is passed to halLiftover on stdin; empty output lines are
    dropped. If targetSeq is not None, the result is restricted to lines
    whose 14th whitespace-separated field equals targetSeq.
    """
    rawOutput = popenCatch(
        "halLiftover --outPSL %s %s stdin %s stdout" % (halFile, refGenome,
                                                        targetGenome),
        stdinString=refBedLine)
    pslLines = []
    for pslLine in rawOutput.split("\n"):
        if pslLine != "":
            pslLines.append(pslLine)
    if targetSeq is not None:
        pslLines = [pslLine for pslLine in pslLines
                    if pslLine.split()[13] == targetSeq]
    return pslLines
def realign(fasta, ref):
    '''
    Realign every alignment block of each genome's chained AXT file
    against the reference with cPecanRealign and print the output lines.

    The AXT file is read as groups of three non-blank lines; each group is
    converted with axtToCigar and piped into cPecanRealign.

    :param fasta: genome fasta files (looked up by genome name via .get;
                  presumably a dict of name -> fasta path, confirm against
                  the caller)
    :param ref: reference species
    '''
    seqFile1 = fasta.get(ref)
    for genome in fasta:
        if genome == ref:
            continue
        seqFile2 = fasta.get(genome)
        axt = genome + '.chained.axt'
        with open(axt) as infile:
            block = []
            for line in infile:
                col = line.split()
                if not col:
                    # Blank separator line. Lines read from a file keep
                    # their newline, so testing the raw line for
                    # truthiness never detects these.
                    block = []
                    continue
                block.append(col)
                if len(block) == 3:
                    cig = axtToCigar(block)
                    realignCommand = "echo '%s' | cPecanRealign %s %s %s" % (
                        cig, "-u /dev/stdout", seqFile1, seqFile2)
                    print(popenCatch(realignCommand).split("\n"))
                    # Start the next block even when no blank line follows.
                    block = []
def getBedLineForSequence(halFile, genome, sequence):
    """Get a bed line spanning a given sequence, as reported by
    "halStats --bedSequences".

    Raises RuntimeError if the sequence name matches no line, or more than
    one line, of the halStats output.
    """
    command = "halStats --bedSequences %s %s" % (genome, halFile)
    splitLines = [bedLine.split()
                  for bedLine in popenCatch(command).split("\n")
                  if bedLine != ""]
    seqLines = [fields for fields in splitLines if fields[0] == sequence]
    matchCount = len(seqLines)
    if matchCount > 1:
        raise RuntimeError("More than one sequence named %s in genome %s, "
                           "aborting!" % (sequence, genome))
    if matchCount == 0:
        raise RuntimeError("No sequence named %s found in genome %s" % (sequence, genome))
    return "\t".join(seqLines[0])
def testCPecanRealignRescoreByIdentityAndProb(self):
    """Runs the realign command with each of the three rescoring options
    and checks that every rescored cigar has a score between 0 and 100.
    """
    def nonEmptyLines(command):
        # Output lines of a command, without the empty strings left over
        # from splitting on newlines.
        return [i for i in popenCatch(command).split("\n") if i != '']

    for seqFile1, seqFile2 in seqFilePairGenerator():
        realignCommandByIdentity, lastzCommand = getCommands(
            seqFile1, seqFile2, realignArguments="--rescoreByIdentity")
        realignCommandByPosteriorProb = getCommands(
            seqFile1, seqFile2,
            realignArguments="--rescoreByPosteriorProb")[0]
        realignCommandByIdentityIgnoringGaps = getCommands(
            seqFile1, seqFile2,
            realignArguments="--rescoreByIdentityIgnoringGaps")[0]
        lineGroups = zip(nonEmptyLines(realignCommandByIdentity),
                         nonEmptyLines(realignCommandByPosteriorProb),
                         nonEmptyLines(realignCommandByIdentityIgnoringGaps),
                         nonEmptyLines(lastzCommand))
        for lineGroup in lineGroups:
            # All four lines are parsed, the lastz one included, although
            # only the three realigned cigars are score-checked.
            cigars = [cigarReadFromString(line) for line in lineGroup]
            # Check scores are as expected
            for realignCigar in cigars[:3]:
                self.assertTrue(realignCigar.score >= 0)
                self.assertTrue(realignCigar.score <= 100.0)
def run(self):
    """Schedule a GetInsertedColumnBed child for every non-leaf genome of
    the hal tree, followed by RunAncestorsMLParallel over the bed files.
    """
    # Find all ancestral genomes using the tree.
    tree = NXNewick().parseString(
        popenCatch("halStats --tree %s" % self.halFile))
    bedFiles = {}  # genome => bed files of inserted columns
    for nodeId in tree.postOrderTraversal():
        if len(tree.getChildren(nodeId)) != 0:
            # Only internal nodes are processed; leaves are skipped.
            assert tree.hasName(nodeId)
            genome = tree.getName(nodeId)
            bedFileForGenome = getTempFile(rootDir=self.getGlobalTempDir())
            bedFiles[genome] = bedFileForGenome
            self.addChildTarget(
                GetInsertedColumnBed(self.halFile, genome, bedFileForGenome))
    followOn = RunAncestorsMLParallel(self.halFile, self.phyloPModel,
                                      bedFiles, self.jobsPerGenome,
                                      self.threshold)
    self.setFollowOnTarget(followOn)
def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Main consensus alignment function.

    Writes the reference transcript named gp.name to a temp fasta, BLATs it
    against the prepared target file, chains the PSL with simpleChain and
    returns [gp.id, gp.name, best coverage, best identity] as strings.
    """
    ref_tx_fasta = Fasta(ref_tx_fasta)
    target_genome_fasta = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp,
                                                  target_genome_fasta)
    fastaWrite(tmp_ref, gp.name, str(ref_tx_fasta[gp.name]))
    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref,
                                                       tmp_psl)
    system(blat_cmd)
    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    # The last element of the split is dropped, as in the other alignment
    # helpers.
    chained = popenCatch(chain_cmd).split("\n")[:-1]
    best_cov, best_ident = evaluate_blat_results(chained)
    return [str(value) for value in (gp.id, gp.name, best_cov, best_ident)]
def sanityCheckSequence(self, path):
    """Warns the user about common problems with the input sequences.

    NOTE(review): the bare ``return`` at the top of the body disables all
    of the checks below, so this method currently does nothing. Confirm
    whether that is intentional.
    """
    # Relies on cactus_analyseAssembly output staying in the
    # format it's currently in.
    return
    analyseCommand = "cactus_analyseAssembly"
    if os.path.isdir(path):
        analyseCommand = "cat %s/* | %s -" % (path, analyseCommand)
    else:
        analyseCommand = analyseCommand + " %s" % path
    output = popenCatch(analyseCommand)
    try:
        maskedMatch = re.search(r'Proportion-repeat-masked: ([0-9.]*)', output)
        repeatMaskedFrac = float(maskedMatch.group(1))
        nMatch = re.search(r'ProportionNs: ([0-9.]*)', output)
        nFrac = float(nMatch.group(1))
    except ValueError:
        # This can happen if the genome has 0 length, making the fractions NaN.
        # We warn the user but return afterwards, as the rest of the checks are
        # dependent on the fraction values.
        sys.stderr.write(
            "WARNING: sequence path %s has 0 length. Consider "
            "removing it from your input file.\n\n" % path)
        return
    # These thresholds are pretty arbitrary, but should be good for
    # badly- to well-assembled vertebrate genomes.
    if repeatMaskedFrac > 0.70:
        maskedWarning = (
            "WARNING: sequence path %s has an extremely high "
            "proportion of masked bases: %f. progressiveCactus"
            " expects a soft-masked genome, i.e. all lowercase"
            " characters are considered masked. The process "
            "will proceed normally, but make sure you haven't "
            "accidentally provided an all-lowercase genome, "
            "in which case nothing will be aligned to "
            "it!\n\n" % (path, repeatMaskedFrac))
        sys.stderr.write(maskedWarning)
    if nFrac > 0.30:
        nWarning = (
            "WARNING: sequence path %s has an extremely high "
            "proportion of 'N' bases: %f. The process will "
            "proceed normally, but make sure your genome "
            "isn't hard-masked! Alignments to hard-masked "
            "genomes are much worse than to soft-masked "
            "genomes. If the genome just has a lot of "
            "poorly assembled regions, feel free to "
            "ignore this message.\n\n" % (path, nFrac))
        sys.stderr.write(nWarning)
def find_single_copy(target, args, chunk, result_path):
    """
    Score each region for percent single copyness

    For every (chrom, start, stop) in chunk, runs
    halSingleCopyRegionsExtract, sums the lengths of the intervals it
    reports and writes chrom, start, stop and format_ratio(total, length)
    as one tab-separated line to result_path.
    """
    cmd_template = 'halSingleCopyRegionsExtract {} {} --refSequence {} --start {} --length {}'
    with open(result_path, 'w') as outf:
        for chrom, start, stop in chunk:
            start, stop = int(start), int(stop)
            length = stop - start
            cmd = cmd_template.format(args.hal, args.ref_genome, chrom,
                                      start, length)
            # The last element of the split output is dropped.
            region_lines = popenCatch(cmd).split('\n')[:-1]
            tot = 0
            for fields in (region.split() for region in region_lines):
                # Interval length from the last two columns of each line.
                tot += int(fields[-1]) - int(fields[-2])
            record = [str(value) for value in
                      (chrom, start, stop, format_ratio(tot, length))]
            outf.write('\t'.join(record) + '\n')
def run(self):
    """Schedule one ExtractInsertions child per genome in the hal file and
    a ReduceOutputs follow-on that combines the per-genome temp files.
    """
    opts = self.opts
    genomes = popenCatch("halStats --genomes %s" % opts.halPath).split()
    # main outputs, entirely inserted sequence outputs, total inserted bases outputs
    mainOutputs = []
    entirelyInsertedOutputs = []
    totalInsertedOutputs = []
    for genome in genomes:
        # Temp file holding this genome's output, which is concatenated
        # with the others at the end.
        tempOutput = getTempFile(rootDir=self.getGlobalTempDir())
        mainOutputs.append(tempOutput)

        # Temp file for entirely inserted seqs, only when requested.
        tempEntirelyInsertedSequencesPath = None
        if opts.entirelyInsertedSequencesPath is not None:
            tempEntirelyInsertedSequencesPath = getTempFile(
                rootDir=self.getGlobalTempDir())
            entirelyInsertedOutputs.append(tempEntirelyInsertedSequencesPath)

        # Temp file for total inserted bases, only when requested.
        tempTotalInsertedBasesPath = None
        if opts.totalInsertedBasesPath is not None:
            tempTotalInsertedBasesPath = getTempFile(
                rootDir=self.getGlobalTempDir())
            totalInsertedOutputs.append(tempTotalInsertedBasesPath)

        self.addChildTarget(
            ExtractInsertions(opts.halPath, genome, tempOutput,
                              opts.samplePerGenome, opts.samples,
                              opts.noGaps,
                              tempEntirelyInsertedSequencesPath,
                              tempTotalInsertedBasesPath))

    outputss = [mainOutputs, entirelyInsertedOutputs, totalInsertedOutputs]
    finalPaths = [opts.output, opts.entirelyInsertedSequencesPath,
                  opts.totalInsertedBasesPath]
    headers = ['insertionSize\tgenome\tseq\tmaskedBases',
               'insertionSize\tgenome\tseq\tmaskedBases',
               'genome\ttotalInsertedBases']
    self.setFollowOnTarget(
        ReduceOutputs(outputss, finalPaths,
                      [not opts.samplePerGenome, False, False],
                      [opts.samples, None, None],
                      headers))
def run(self):
    """Split each genome's bed file into at most jobsPerGenome pieces,
    schedule a RunAncestorsML child per piece, and set WriteNucleotides
    as the follow-on over all the per-piece outputs.
    """
    outputsPerGenome = {}
    for genome, bedFile in self.bedFileDict.items():
        outputsPerGenome[genome] = []
        # Reading the file on stdin makes wc print only the count, and
        # int() tolerates the padding some wc implementations add. The old
        # "wc -l FILE | cut -d' ' -f 1" yielded an empty string wherever
        # wc left-pads the number.
        numLines = int(popenCatch("wc -l < %s" % bedFile))
        # Always at least one line per job, so the range step is never 0.
        linesPerJob = max(
            1, int(math.ceil(float(numLines) / self.jobsPerGenome)))
        for start in xrange(0, numLines, linesPerJob):
            end = min(start + linesPerJob, numLines)
            bedForJob = getTempFile(rootDir=self.getGlobalTempDir())
            # head keeps everything up to the end of this slice, tail then
            # keeps the last (end - start) lines of that.
            system("head -n %d %s | tail -n %d > %s"
                   % (start + linesPerJob, bedFile, end - start, bedForJob))
            output = getTempFile(rootDir=self.getGlobalTempDir())
            self.addChildTarget(
                RunAncestorsML(self.halFile, genome, bedForJob,
                               self.phyloPModel, output))
            outputsPerGenome[genome].append(output)
    self.setFollowOnTarget(WriteNucleotides(outputsPerGenome, self.halFile))
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """BLAT each transcript id in chunk against its reference transcript
    and write one tab-separated result line per id to a randomly named
    .txt file in out_path.

    An id with no PSL rows is written as (id, "0", "0"); otherwise the
    line holds the id, identity(rows) and coverage(rows).
    """
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    for aug_aId in chunk:
        # The reference id is the transcript id with both alignment-number
        # suffixes removed.
        gencode_id = remove_alignment_number(
            remove_augustus_alignment_number(aug_aId))
        gencode_seq = str(r_f[gencode_id])
        aug_seq = str(g_f[aug_aId])
        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)
        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(
            tmp_gencode, tmp_aug)
        # The last three elements of the split output are dropped before
        # the rest is parsed as PSL rows.
        psl_lines = popenCatch(blat_cmd).split("\n")[:-3]
        if len(psl_lines) == 0:
            results.append([aug_aId, "0", "0"])
            continue
        p_list = [PslRow(psl_line) for psl_line in psl_lines]
        results.append([str(value) for value in
                        (aug_aId, identity(p_list), coverage(p_list))])
    out_file = os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt")
    with open(out_file, "w") as outf:
        for row in results:
            outf.write("\t".join(row) + "\n")
def align_cgp(tmp_dir, gp, target_genome_fasta, tx_dict, ref_tx_fasta):
    """
    Main CGP alignment function. For each CGP transcript, uses tx_dict to
    BLAT against all transcripts. These alignments are then chained and the
    highest coverage alignment used. This circumvents problems with
    multiple self alignments in the case of repeats.

    Returns one list of strings per (gene, transcript) pair in tx_dict:
    [gp.name, gene name, transcript name, best coverage, best identity].
    """
    ref_tx_fasta = Fasta(ref_tx_fasta)
    target_genome_fasta = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp,
                                                  target_genome_fasta)
    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref,
                                                       tmp_psl)
    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    results = []
    for gene_name, tx_names in tx_dict.iteritems():
        for tx_name in tx_names:
            # The same temp files are reused for every transcript.
            fastaWrite(tmp_ref, tx_name, str(ref_tx_fasta[tx_name]))
            system(blat_cmd)
            chained = popenCatch(chain_cmd).split("\n")[:-1]
            best_cov, best_ident = evaluate_blat_results(chained)
            results.append([str(value) for value in
                            (gp.name, gene_name, tx_name, best_cov,
                             best_ident)])
    return results