Ejemplo n.º 1
0
def runAsMedianMedianProblemTest(medianHistory):
    """Run the external AsMedian solver on the leaf genomes of medianHistory.

    Requires AsMedian to be installed. It was obtained from:
    https://sites.google.com/site/andrewweixu/Home/software/asmedian

    The leaf genome string is written to a temporary file in the current
    working directory, "java BIOMedian" is run on it, and the result is read
    back from the same path with ".rst" appended. Both files are removed
    afterwards, even if the solver or the read fails.

    Returns a Genome holding one Chromosome per result line, up to (but not
    including) the first line that starts with '#'.
    """
    tempFile = os.path.join(os.getcwd(), "simulatedGenomeTempFile.txt")
    resultFile = tempFile + ".rst"
    try:
        # Dump to disk
        with open(tempFile, 'w') as fileHandle:
            fileHandle.write(medianHistory.getLeafGenomeString())
        #-cp /Users/benedictpaten/Desktop/ASMedian-1.0
        popenCatch("java BIOMedian %s" % tempFile)
        # Parse in
        with open(resultFile, 'r') as fileHandle:
            resultLines = fileHandle.readlines()
    finally:
        # Never leave the scratch files behind, whatever happened above.
        for path in (tempFile, resultFile):
            if os.path.exists(path):
                os.remove(path)
    asMedianMedianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    # The first line of the result file is skipped.
    for line in resultLines[1:]:
        if line.startswith('#'):
            break
        asMedianChromosome = Chromosome()
        # The first whitespace-separated token on each line is skipped; the
        # remaining tokens are the integer elements of the chromosome.
        for element in line.split()[1:]:
            asMedianChromosome.append(int(element))
        asMedianMedianGenome.addChromosome(asMedianChromosome)
    return asMedianMedianGenome
def runAsMedianMedianProblemTest(medianHistory):
    """Run the external AsMedian solver on the leaf genomes of medianHistory.

    Requires AsMedian to be installed. It was obtained from:
    https://sites.google.com/site/andrewweixu/Home/software/asmedian

    The leaf genome string is written to a temporary file in the current
    working directory, "java BIOMedian" is run on it, and the result is read
    back from the same path with ".rst" appended. Both files are removed
    afterwards, even if the solver or the read fails.

    Returns a Genome holding one Chromosome per result line, up to (but not
    including) the first line that starts with '#'.
    """
    tempFile = os.path.join(os.getcwd(), "simulatedGenomeTempFile.txt")
    resultFile = tempFile + ".rst"
    try:
        # Dump to disk
        with open(tempFile, 'w') as fileHandle:
            fileHandle.write(medianHistory.getLeafGenomeString())
        #-cp /Users/benedictpaten/Desktop/ASMedian-1.0
        popenCatch("java BIOMedian %s" % tempFile)
        # Parse in
        with open(resultFile, 'r') as fileHandle:
            resultLines = fileHandle.readlines()
    finally:
        # Never leave the scratch files behind, whatever happened above.
        for path in (tempFile, resultFile):
            if os.path.exists(path):
                os.remove(path)
    asMedianMedianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    # The first line of the result file is skipped.
    for line in resultLines[1:]:
        if line.startswith('#'):
            break
        asMedianChromosome = Chromosome()
        # The first whitespace-separated token on each line is skipped; the
        # remaining tokens are the integer elements of the chromosome.
        for element in line.split()[1:]:
            asMedianChromosome.append(int(element))
        asMedianMedianGenome.addChromosome(asMedianChromosome)
    return asMedianMedianGenome
Ejemplo n.º 3
0
 def testCPecanRealignSplitSequences(self):
     """Check that realigning with indels longer than 100bp split gives the
     same coverage, on both input sequences, as realigning with no extra
     arguments."""
     for seqFile1, seqFile2 in seqFilePairGenerator():
         # getCommands also returns the lastz command; it is not needed
         # here, but reusing getCommands keeps the parameters the same as
         # in all the other tests.
         baseCommand, _ = getCommands(seqFile1, seqFile2)
         plainOutput = getTempFile()
         splitOutput = getTempFile()
         system(baseCommand + " > %s" % plainOutput)
         system(baseCommand + " --splitIndelsLongerThanThis 100" + " > %s" % splitOutput)

         # The following will fail until we refactor.

         # Compare coverage on seqFile1 first, then on seqFile2.
         for seqFile in (seqFile1, seqFile2):
             splitCoverage = popenCatch("cactus_coverage %s %s" % (seqFile, splitOutput))
             plainCoverage = popenCatch("cactus_coverage %s %s" % (seqFile, plainOutput))
             self.assertTrue(splitCoverage == plainCoverage)
         os.remove(plainOutput)
         os.remove(splitOutput)
Ejemplo n.º 4
0
    def testProgressiveOutgroupsVsAllOutgroups(self):
        """Compare outgroup coverage on an ingroup between the "set against
        set" blast mode and the "ingroups vs. outgroups" blast mode.
        """
        encodeRegion = "ENm001"
        ingroup = "human"
        outgroups = ["macaque", "rabbit", "dog"]
        regionPath = os.path.join(self.encodePath, encodeRegion)

        def fastaPath(species):
            # Path of the fasta file for one species in the chosen region.
            return os.path.join(regionPath, species + "." + encodeRegion + ".fa")

        ingroupPath = fastaPath(ingroup)
        outgroupPaths = [fastaPath(species) for species in outgroups]

        def coveredBases(cigarFile):
            # Coverage on the ingroup, in bases, from the given alignments.
            coverageFile = getTempFile(rootDir=self.tempDir)
            calculateCoverage(sequenceFile=ingroupPath, cigarFile=cigarFile, outputFile=coverageFile)
            return int(popenCatch("cat %s | awk '{ total +=  $3 - $2} END { print total }'" % coverageFile))

        def coveredBasesFromLastOutgroup(alignmentsFile):
            # Same, but only from alignment lines mentioning the last outgroup.
            filteredAlignments = getTempFile(rootDir=self.tempDir)
            system("grep %s %s > %s" % (outgroups[-1], alignmentsFile, filteredAlignments))
            return coveredBases(filteredAlignments)

        # "Set against set" mode: align the entire ingroup vs each outgroup.
        runCactusBlast([ingroupPath], alignmentsFile=self.tempOutputFile,
                       toilDir=os.path.join(self.tempDir, "setVsSetToil"),
                       chunkSize=500000, overlapSize=10000,
                       targetSequenceFiles=outgroupPaths)
        # "Ingroup vs outgroups" mode: align the ingroup vs the outgroups
        # in order, trimming away sequence that's already been aligned.
        runCactusBlastIngroupsAndOutgroups(
            [ingroupPath], outgroupPaths,
            alignmentsFile=self.tempOutputFile2,
            toilDir=os.path.join(self.tempDir, "outgroupToil"))

        # Total coverage on the ingroup from each run.
        coverageSetVsSet = coveredBases(self.tempOutputFile)
        coverageIngroupVsOutgroups = coveredBases(self.tempOutputFile2)

        print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
        print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))

        # The trimming strategy should still recover a reasonable fraction
        # of the alignments.
        self.assertTrue(float(coverageIngroupVsOutgroups)/coverageSetVsSet >= 0.95)

        # Coverage from just the last outgroup. Obviously this should be
        # much higher in set vs set mode than in ingroup vs outgroup mode.
        coverageFromLastOutgroupSetVsSet = coveredBasesFromLastOutgroup(self.tempOutputFile)
        coverageFromLastOutgroupInVsOut = coveredBasesFromLastOutgroup(self.tempOutputFile2)

        print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (outgroups[-1], coverageFromLastOutgroupSetVsSet))
        print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (outgroups[-1], coverageFromLastOutgroupInVsOut))

        self.assertTrue(float(coverageFromLastOutgroupInVsOut)/coverageFromLastOutgroupSetVsSet <= 0.10)
Ejemplo n.º 5
0
 def testCPecanRealign(self):
     """Run cPecanRealign with default parameters and check that each
     realigned cigar covers the same coordinates as the matching lastz cigar.
     """
     for seqFile1, seqFile2 in seqFilePairGenerator():
         realignCommand, lastzCommand = getCommands(seqFile1, seqFile2)
         realignLines = [ text for text in popenCatch(realignCommand).split("\n") if text != '' ]
         lastzLines = [ text for text in popenCatch(lastzCommand).split("\n") if text != '' ]
         # NOTE(review): zip stops at the shorter list, so surplus lines in
         # either output are not compared.
         for realignLine, lastzLine in zip(realignLines, lastzLines):
             realigned = cigarReadFromString(realignLine)
             reference = cigarReadFromString(lastzLine)
             self.assertTrue(realigned.sameCoordinates(reference))
Ejemplo n.º 6
0
 def testCPecanRealignDummy(self):
     """Run cPecanRealign in "rescoreOriginalAlignment" mode and check the
     cigars it emits equal the ones produced by running lastz alone.
     """
     for seqFile1, seqFile2 in seqFilePairGenerator():
         realignCommand, lastzCommand = getCommands(seqFile1, seqFile2, "--rescoreOriginalAlignment")
         realignLines = [ text for text in popenCatch(realignCommand).split("\n") if text != '' ]
         lastzLines = [ text for text in popenCatch(lastzCommand).split("\n") if text != '' ]
         # NOTE(review): zip stops at the shorter list, so surplus lines in
         # either output are not compared.
         for realignLine, lastzLine in zip(realignLines, lastzLines):
             realigned = cigarReadFromString(realignLine)
             reference = cigarReadFromString(lastzLine)
             self.assertTrue(realigned != None)
             self.assertTrue(realigned == reference)
Ejemplo n.º 7
0
def getBedLineForSequence(halFile, genome, sequence, start, length):
    """Get a tab-separated bed line for a sequence of a genome in a hal file.

    The candidate lines come from "halStats --bedSequences". If start and
    length are both None the line for the full sequence is returned;
    if both are given, the line's start and end columns are replaced by
    start and start + length.

    Raises RuntimeError if the sequence is missing or appears more than
    once, if the requested region runs past the end of the sequence, or if
    only one of start/length is provided.
    """
    bedLines = popenCatch("halStats --bedSequences %s %s" %
                          (genome, halFile)).split("\n")
    # A list comprehension (rather than filter()) keeps len() and indexing
    # valid on Python 3 as well; whitespace-only lines split to an empty
    # list and are skipped instead of raising IndexError.
    seqLines = [fields for fields in (line.split() for line in bedLines)
                if fields and fields[0] == sequence]
    if len(seqLines) > 1:
        raise RuntimeError("More than one sequence named %s in genome %s, "
                           "aborting!" % (sequence, genome))
    elif len(seqLines) == 0:
        raise RuntimeError("No sequence named %s found in genome %s" %
                           (sequence, genome))
    seqLine = seqLines[0]
    if start is None and length is None:
        return "\t".join(seqLine)
    elif start is not None and length is not None:
        # Column 2 of the bed line is the end coordinate of the sequence.
        if start + length > int(seqLine[2]):
            raise RuntimeError("Selected region runs off end of sequence.")
        seqLine[1] = str(start)
        seqLine[2] = str(start + length)
        return "\t".join(seqLine)
    else:
        raise RuntimeError("Both start and length must be provided.")
Ejemplo n.º 8
0
def runCactusReference(cactusDiskDatabaseString, flowerNames, logLevel=None,
                       matchingAlgorithm=None, 
                       referenceEventString=None, 
                       permutations=None,
                       useSimulatedAnnealing=None,
                       theta=None,
                       phi=None, 
                       maxWalkForCalculatingZ=None,
                       ignoreUnalignedGaps=None,
                       wiggle=None, 
                       numberOfNs=None,
                       minNumberOfSequencesToSupportAdjacency=None,
                       makeScaffolds=None):
    """Run the cactus_reference command.

    Each optional argument is converted to a command-line fragment with
    nameValue; flowerNames is fed to the command on stdin. Returns the
    non-empty lines of the command's output.
    """
    logLevelArg = getLogLevelString2(logLevel)
    # Fragments are listed in the order they appear on the command line.
    optionArgs = (
        nameValue("matchingAlgorithm", matchingAlgorithm),
        nameValue("referenceEventString", referenceEventString),
        nameValue("permutations", permutations, int),
        nameValue("useSimulatedAnnealing", useSimulatedAnnealing, bool),
        nameValue("theta", theta, float),
        nameValue("phi", phi, float),
        nameValue("maxWalkForCalculatingZ", maxWalkForCalculatingZ, int),
        nameValue("ignoreUnalignedGaps", ignoreUnalignedGaps, bool),
        nameValue("wiggle", wiggle, float),
        nameValue("numberOfNs", numberOfNs, int),
        nameValue("minNumberOfSequencesToSupportAdjacency", minNumberOfSequencesToSupportAdjacency, int),
        nameValue("makeScaffolds", makeScaffolds, bool),
    )
    command = "cactus_reference --cactusDisk '%s' --logLevel %s %s %s %s %s %s %s %s %s %s %s %s %s" % \
        ((cactusDiskDatabaseString, logLevelArg) + optionArgs)
    masterMessages = popenCatch(command, stdinString=flowerNames)
    logger.info("Ran cactus_reference okay")
    return [ message for message in masterMessages.split("\n") if message != '' ]
Ejemplo n.º 9
0
def get_chromosomes(hal, ref_genome):
    """
    Returns a set of chromosomes present in the reference genome.

    The names are the first whitespace-separated field of each non-blank
    line printed by "halStats <hal> --chromSizes <ref_genome>".
    """
    sizes = popenCatch('halStats {} --chromSizes {}'.format(hal, ref_genome))
    # Filter blank lines explicitly rather than dropping the last line
    # unconditionally: that would lose a chromosome whenever the output does
    # not end with a newline, and any other blank line would raise IndexError.
    return {line.split()[0] for line in sizes.split("\n") if line.strip()}
def test_ancestral_nodes(target, region_specific_conserved,
                         accelerated_genomes, maf_path, region_bed,
                         outf_handle):
    """Run the phyloFit / phastOdds test for every (ancestor, branch model)
    pair yielded by rename_model, writing positive results to outf_handle."""
    renamed = rename_model(target, region_specific_conserved,
                           accelerated_genomes)
    for anc_name, branch_model in renamed:
        assert len(anc_name) > 0, (anc_name, branch_model,
                                   region_specific_conserved,
                                   accelerated_genomes)
        out_root = os.path.join(target.getGlobalTempDir(),
                                'region_specific_accelerated')
        fit_cmd = 'phyloFit --init-model {} --scale-subtree {}:loss --out-root {} {}'.format(
            branch_model, anc_name, out_root, maf_path)
        system(fit_cmd)
        # The model file read below is the --out-root path plus '.mod'.
        accelerated_model = out_root + '.mod'
        odds_cmd = 'phastOdds --output-bed --features {} --background-mods {} --feature-mods {} {}'.format(
            region_bed, branch_model, accelerated_model, maf_path)
        fields = popenCatch(odds_cmd).split()
        # discard the result if the test is not positive
        if int(fields[-1]) <= 0:
            continue
        fields[-2] = anc_name
        outf_handle.write('\t'.join(fields) + '\n')
Ejemplo n.º 11
0
 def sanityCheckSequence(self, path):
     """Write warnings to stderr about common problems with the input
     sequence(s) at path (a file, or a directory whose files are
     concatenated)."""
     # Relies on cactus_analyseAssembly output staying in the
     # format it's currently in.
     analyser = "cactus_analyseAssembly"
     if os.path.isdir(path):
         cmdline = "cat %s/* | %s -" % (path, analyser)
     else:
         cmdline = analyser + " %s" % path
     output = popenCatch(cmdline)

     def proportion(pattern):
         # No error-checking on a failed match: all we'd get is a prettier
         # error message, and it will be pretty obvious what's going on
         # (i.e. the analyseAssembly output will have changed).
         return float(re.search(pattern, output).group(1))

     repeatMaskedFrac = proportion(r'Proportion-repeat-masked: ([0-9.]*)')
     nFrac = proportion(r'ProportionNs: ([0-9.]*)')
     # These thresholds are pretty arbitrary, but should be good for
     # badly- to well-assembled vertebrate genomes.
     tooMasked = repeatMaskedFrac > 0.70
     tooManyNs = nFrac > 0.30
     if tooMasked:
         maskedWarning = ("WARNING: sequence path %s has an extremely high "
                          "proportion of masked bases: %f. progressiveCactus"
                          " expects a soft-masked genome, i.e. all lowercase"
                          " characters are considered masked. The process "
                          "will proceed normally, but make sure you haven't "
                          "accidentally provided an all-lowercase genome, "
                          "in which case nothing will be aligned to "
                          "it!\n\n" % (path, repeatMaskedFrac))
         sys.stderr.write(maskedWarning)
     if tooManyNs:
         nWarning = ("WARNING: sequence path %s has an extremely high "
                     "proportion of 'N' bases: %f. The process will "
                     "proceed normally, but make sure your genome "
                     "isn't hard-masked! Alignments to hard-masked "
                     "genomes are much worse than to soft-masked "
                     "genomes. If the genome just has a lot of "
                     "poorly assembled regions, feel free to "
                     "ignore this message.\n\n" % (path, nFrac))
         sys.stderr.write(nWarning)
Ejemplo n.º 12
0
 def getChunks(self, sequenceFiles, chunksDir):
     """Run cactus_blast_chunkSequences on sequenceFiles, with chunksDir
     passed as an argument, and return the non-empty lines it prints."""
     command = "cactus_blast_chunkSequences %s %i %i %s %s" % (
         getLogLevelString(),
         self.blastOptions.chunkSize,
         self.blastOptions.overlapSize,
         chunksDir,
         " ".join(sequenceFiles),
     )
     printedLines = popenCatch(command).split("\n")
     return [ printed for printed in printedLines if printed != "" ]
Ejemplo n.º 13
0
 def run(self):
     """Chunk the input sequence, schedule one PreprocessChunk child per
     chunk, and set MergeChunks as the follow-on."""
     logger.info("Preparing sequence for preprocessing")
     # chunk it up
     inChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksIn"))
     chunkCommand = "cactus_blast_chunkSequences %s %i 0 %s %s" % (
         getLogLevelString(),
         self.prepOptions.chunkSize,
         inChunkDirectory,
         self.inSequencePath,
     )
     inChunkList = [printed for printed in popenCatch(chunkCommand).split("\n") if printed != ""]
     outChunkDirectory = makeSubDir(os.path.join(self.getGlobalTempDir(), "preprocessChunksOut"))
     outChunkList = []
     totalChunks = len(inChunkList)
     # Every input chunk gets its own output chunk; the output chunks are
     # what get concatenated together by the follow-on.
     for i, inChunk in enumerate(inChunkList):
         outChunk = os.path.join(outChunkDirectory, "chunk_%i" % i)
         outChunkList.append(outChunk)
         # Calculate the number of chunks to use
         inChunkNumber = int(max(1, math.ceil(totalChunks * self.prepOptions.proportionToSample)))
         assert 0 < inChunkNumber <= totalChunks
         # Window of chunks flanking and including the current chunk
         windowStart = max(0, i - inChunkNumber // 2)
         inChunks = inChunkList[windowStart : windowStart + inChunkNumber]
         missing = inChunkNumber - len(inChunks)
         if missing > 0:
             # Wrap around to the front, treating the list as circular
             inChunks += inChunkList[:missing]
         assert len(inChunks) == inChunkNumber
         child = PreprocessChunk(
             self.prepOptions, inChunks, float(inChunkNumber) / totalChunks, inChunk, outChunk
         )
         self.addChildTarget(child)
     # follow on to merge chunks
     self.setFollowOnTarget(MergeChunks(self.prepOptions, outChunkList, self.outSequencePath))
Ejemplo n.º 14
0
def run_augustus(target, hint_f, seq_f, name, start, stop, aln_start, aln_stop,
                 cfg_version, cfg_path, out_file_tree):
    """
    Run one Augustus command and, if any transcript overlaps [start, stop],
    write the renamed output to a temp file taken from out_file_tree.
    """
    cmd = augustus_cmd.format(fasta=seq_f, start=start, stop=stop,
                              cfg=cfg_path, hints=hint_f)
    output_lines = popenCatch(cmd).split("\n")
    # keep only the transcript lines, split into fields
    transcript_rows = [
        text.split() for text in output_lines if "\ttranscript\t" in text
    ]
    # keep the last field of each transcript overlapping the alignment range
    transcripts = [
        row[-1] for row in transcript_rows
        if int(row[4]) >= start and int(row[3]) <= stop
    ]
    # nothing survived the filter: stop here
    if not transcripts:
        return
    # rename transcripts based on cfg version, and make names unique
    name_map = rename_transcripts(transcripts, cfg_version, name)
    # write this to a shared location where we will combine later
    out_path = out_file_tree.getTempFile()
    write_augustus(output_lines, name_map, out_path, start)
Ejemplo n.º 15
0
    def __getSeqInfo(self, faPaths, event):
        """Run cactus_analyseAssembly on faPaths and build a SeqInfo from
        the statistics it prints.

        Raises RuntimeError if any path is not an existing file. For events
        in self.candidateSet the total length and N50 are multiplied by
        self.candidateBoost.
        """
        for faPath in faPaths:
            if not os.path.isfile(faPath):
                raise RuntimeError("Unable to open sequence file %s" % faPath)
        cmdLine = "cactus_analyseAssembly" + "".join(" %s" % faPath for faPath in faPaths)
        isCandidate = self.candidateSet is not None and event in self.candidateSet
        analyseOutput = popenCatch(cmdLine).split()

        def valueAfter(label, convert):
            # The value is the token immediately following its label.
            labelIdx = analyseOutput.index(label)
            assert labelIdx >= 0 and labelIdx < len(analyseOutput) - 1
            return convert(analyseOutput[labelIdx + 1])

        numSequences = valueAfter("Total-sequences:", int)
        totalLength = valueAfter("Total-length:", int)
        nsPct = valueAfter("ProportionNs:", float)
        rmPct = valueAfter("Proportion-repeat-masked:", float)
        assert rmPct <= 1. and rmPct >= 0.
        n50 = valueAfter("N50:", int)

        if isCandidate:
            totalLength *= self.candidateBoost
            n50 *= self.candidateBoost

        # Scale length and N50 by (1 - ProportionNs), clamped at zero.
        nonNFraction = 1. - nsPct
        umLength = max(0, totalLength * nonNFraction)
        umN50 = max(0, n50 * nonNFraction)

        return self.SeqInfo(numSequences, totalLength, umLength, n50, umN50)
Ejemplo n.º 16
0
    def _run_evolver_decomposed_no_outgroup(self, binariesMode):
        """ Run just the mouse-rat alignment, step by step from the plan
        printed by cactus-prepare.  Inspired by issues arising here
        https://github.com/ComparativeGenomicsToolkit/cactus/pull/216
        https://github.com/ComparativeGenomicsToolkit/cactus/pull/217 """

        out_dir = os.path.join(self.tempDir, 'output')
        out_seqfile = os.path.join(out_dir, 'evolverMammalsOut.txt')
        in_seqfile = os.path.join(self.tempDir, 'evolverMammalsIn.txt')
        seqfile_lines = (
            '(simMouse_chr6:0.084509,simRat_chr6:0.091589);\n',
            'simMouse_chr6 http://s3-us-west-2.amazonaws.com/jcarmstr-misc/testRegions/evolverMammals/simMouse.chr6\n',
            'simRat_chr6 http://s3-us-west-2.amazonaws.com/jcarmstr-misc/testRegions/evolverMammals/simRat.chr6\n',
        )
        with open(in_seqfile, 'w') as inseq:
            for seqfile_line in seqfile_lines:
                inseq.write(seqfile_line)

        prepare_args = ['cactus-prepare', in_seqfile,
                        '--outDir', out_dir,
                        '--outSeqFile', out_seqfile,
                        '--outHal', self._out_hal(binariesMode),
                        '--jobStore', self._job_store(binariesMode)]
        job_plan = popenCatch(' '.join(prepare_args))

        for plan_line in job_plan.split('\n'):
            command = plan_line.strip()
            # Blank lines and '#' comment lines are not commands.
            if not command or command.startswith('#'):
                continue
            # todo interface in prepare
            if command.startswith('cactus-'):
                command += ' --binariesMode {}'.format(binariesMode)
                if binariesMode == 'docker':
                    command += ' --latest'
            if command.startswith('cactus-align'):
                # Remove all the id prefixes to pretend the cigars did not
                # come from cactus-blast
                strip_ids = "sed -i -e 's/id=[0,1]|//g' {}/Anc0.cigar*".format(out_dir)
                subprocess.check_call(strip_ids, shell=True)
                command += ' --nonBlastInput'
            sys.stderr.write('Running {}'.format(command))
            subprocess.check_call(command, shell=True)
Ejemplo n.º 17
0
def runCactusFlowerStats(cactusDiskDatabaseString, flowerName, logLevel=None):
    """Run cactus_workflow_flowerStats for the given flower and return the
    first line of its output.
    """
    command = "cactus_workflow_flowerStats %s '%s' %s" % (
        getLogLevelString2(logLevel), cactusDiskDatabaseString, flowerName)
    statsLines = popenCatch(command).split("\n")
    return statsLines[0]
Ejemplo n.º 18
0
    def _run_evolver_decomposed(self, name):
        """ Run the full evolver test with the jobstore and output in
        tempDir, executing one by one the commands that cactus-prepare
        prints rather than running everything in a single shot """

        out_dir = os.path.join(self.tempDir, 'output')
        out_seqfile = os.path.join(out_dir, 'evolverMammalsOut.txt')
        in_seqfile = './examples/evolverMammals.txt'
        prepare_args = ['cactus-prepare', in_seqfile,
                        '--outDir', out_dir,
                        '--outSeqFile', out_seqfile,
                        '--outHal', self._out_hal(name),
                        '--jobStore', self._job_store(name)]

        job_plan = popenCatch(' '.join(prepare_args))

        for plan_line in job_plan.split('\n'):
            command = plan_line.strip()
            # Blank lines and '#' comment lines are not commands.
            if not command or command.startswith('#'):
                continue
            # do Anc2 in binariesMode docker to broaden test coverage
            if command.startswith('cactus-') and 'Anc2' in command:
                command += ' --binariesMode docker --latest'
            sys.stderr.write('Running {}'.format(command))
            subprocess.check_call(command, shell=True)
    def run(self):
        """Sample reference positions, schedule ScoreColumns children over
        slices of them, and set Summarize as the follow-on."""
        speciesTree = popenCatch("halStats --tree %s" % (self.opts.halFile)).strip()
        chromSizes = getChromSizes(self.opts.halFile, self.opts.refGenome)

        positions = []
        # Tracks what has been sampled already, so that a column isn't
        # counted multiple times from different reference positions.
        positionSet = set()
        for _ in xrange(self.opts.numSamples):
            # Have to sample the columns here since otherwise it can
            # be difficult to independently seed several RNGs
            candidate = samplePosition(chromSizes)
            if candidate in positionSet:
                continue
            positions.append(candidate)
            positionSet.add(candidate)

        outputs = []
        samplesPerJob = self.opts.samplesPerJob
        for sliceStart in xrange(0, self.opts.numSamples, samplesPerJob):
            jobPositions = positions[sliceStart:sliceStart + samplesPerJob]
            outputFile = getTempFile(rootDir=self.getGlobalTempDir())
            outputs.append(outputFile)
            child = ScoreColumns(self.opts, jobPositions,
                                 outputFile, speciesTree, positionSet)
            self.addChildTarget(child)
        self.setFollowOnTarget(Summarize(self.opts, outputs, self.opts.outputFile, self.opts.writeMismatchesToFile))
Ejemplo n.º 20
0
 def testInvariants(self):
     """Check that cactus_coverage output on two random encode inputs has
     non-empty intervals that are sorted and non-overlapping within each
     chromosome."""
     (seqs, _) = getCactusInputs_encode(random.uniform(0, 2))
     # Chimp encode input has duplicate header names.
     seqs = [i for i in seqs if 'chimp' not in i]
     seqs = random.sample(seqs, 2)
     cigarPath = getTempFile()
     try:
         system("cPecanLastz --format=cigar %s[multiple] %s[multiple] > %s" % \
                (seqs[0], seqs[1], cigarPath))
         bed = popenCatch("cactus_coverage %s %s" % (seqs[1], cigarPath))
         prevChrom = None
         prevStart = None
         prevEnd = None
         # Check that everything is sorted and there are no overlaps
         for line in bed.split("\n"):
             # str.strip() returns a new string; the result must be kept.
             line = line.strip()
             if line == "":
                 continue
             fields = line.split()
             chrom = fields[0]
             start = int(fields[1])
             end = int(fields[2])
             self.assertTrue(end - start >= 1)
             if chrom == prevChrom:
                 self.assertTrue(start > prevStart)
                 self.assertTrue(start >= prevEnd)
             # Remember this interval so the next line is compared against
             # it; without this the checks above would never run.
             prevChrom = chrom
             prevStart = start
             prevEnd = end
     finally:
         # Remove the temp file even when an assertion fails.
         os.remove(cigarPath)
Ejemplo n.º 21
0
 def testFromC(self):
     # Exercise "--from" filtering: only alignments from/to D should be
     # counted towards coverage on C.
     command = "cactus_coverage %s %s --from %s" % (
         self.simpleFastaPathC, self.simpleCigarPath, self.simpleFastaPathD)
     observed = popenCatch(command)
     expected = dedent('''\
     id=3|simpleSeqC1\t0\t10\t\t1
     ''')
     self.assertEqual(observed, expected)
Ejemplo n.º 22
0
def runCactusBar(cactusDiskDatabaseString, flowerNames, logLevel=None,
                         spanningTrees=None, maximumLength=None, 
                         gapGamma=None,
                         matchGamma=None,
                         splitMatrixBiggerThanThis=None,
                         anchorMatrixBiggerThanThis=None,
                         repeatMaskMatrixBiggerThanThis=None,
                         diagonalExpansion=None,
                         constraintDiagonalTrim=None,
                         minimumBlockDegree=None,
                         minimumIngroupDegree=None,
                         minimumOutgroupDegree=None,
                         alignAmbiguityCharacters=None,
                         pruneOutStubAlignments=None,
                         useProgressiveMerging=None,
                         calculateWhichEndsToComputeSeparately=None,
                         largeEndSize=None,
                         endAlignmentsToPrecomputeOutputFile=None,
                         precomputedAlignments=None,
                         ingroupCoverageFile=None,
                         minimumSizeToRescue=None,
                         minimumCoverageToRescue=None,
                         minimumNumberOfSpecies=None):
    """Run the cactus_bar command (the cactus base aligner).

    Each optional argument is converted to a command-line fragment with
    nameValue; flowerNames is fed to the command on stdin. Returns the
    non-empty lines of the command's output.
    """
    logLevelArg = getLogLevelString2(logLevel)
    maximumLengthArg = nameValue("maximumLength", maximumLength, int)
    spanningTreesArg = nameValue("spanningTrees", spanningTrees, int)
    gapGammaArg = nameValue("gapGamma", gapGamma, float)
    matchGammaArg = nameValue("matchGamma", matchGamma, float)
    splitMatrixArg = nameValue("splitMatrixBiggerThanThis", splitMatrixBiggerThanThis, int)
    anchorMatrixArg = nameValue("anchorMatrixBiggerThanThis", anchorMatrixBiggerThanThis, int)
    repeatMaskMatrixArg = nameValue("repeatMaskMatrixBiggerThanThis", repeatMaskMatrixBiggerThanThis, int)
    diagonalExpansionArg = nameValue("diagonalExpansion", diagonalExpansion, int)
    constraintDiagonalTrimArg = nameValue("constraintDiagonalTrim", constraintDiagonalTrim, int)
    # This parameter is passed under the option name "minimumDegree".
    minimumBlockDegreeArg = nameValue("minimumDegree", minimumBlockDegree, int)
    minimumIngroupDegreeArg = nameValue("minimumIngroupDegree", minimumIngroupDegree, int)
    minimumOutgroupDegreeArg = nameValue("minimumOutgroupDegree", minimumOutgroupDegree, int)
    pruneOutStubAlignmentsArg = nameValue("pruneOutStubAlignments", pruneOutStubAlignments, bool)
    alignAmbiguityCharactersArg = nameValue("alignAmbiguityCharacters", alignAmbiguityCharacters, bool)
    useProgressiveMergingArg = nameValue("useProgressiveMerging", useProgressiveMerging, bool)
    calculateEndsArg = nameValue("calculateWhichEndsToComputeSeparately", calculateWhichEndsToComputeSeparately, bool)
    largeEndSizeArg = nameValue("largeEndSize", largeEndSize, int)
    endAlignmentsOutputArg = nameValue("endAlignmentsToPrecomputeOutputFile", endAlignmentsToPrecomputeOutputFile, str)
    precomputedAlignmentsArg = nameValue("precomputedAlignments", precomputedAlignments, str, quotes=True)
    ingroupCoverageFileArg = nameValue("ingroupCoverageFile", ingroupCoverageFile, str, quotes=True)
    minimumSizeToRescueArg = nameValue("minimumSizeToRescue", minimumSizeToRescue, int)
    minimumCoverageToRescueArg = nameValue("minimumCoverageToRescue", minimumCoverageToRescue, float)
    minimumNumberOfSpeciesArg = nameValue("minimumNumberOfSpecies", minimumNumberOfSpecies, int)

    # Values in the order they are substituted into the command line.
    commandArgs = (
        cactusDiskDatabaseString, logLevelArg, spanningTreesArg, maximumLengthArg,
        gapGammaArg, matchGammaArg,
        splitMatrixArg, anchorMatrixArg, repeatMaskMatrixArg,
        constraintDiagonalTrimArg, minimumBlockDegreeArg,
        minimumIngroupDegreeArg, minimumOutgroupDegreeArg,
        alignAmbiguityCharactersArg, pruneOutStubAlignmentsArg, diagonalExpansionArg,
        useProgressiveMergingArg, calculateEndsArg,
        largeEndSizeArg, endAlignmentsOutputArg, precomputedAlignmentsArg,
        ingroupCoverageFileArg, minimumSizeToRescueArg,
        minimumCoverageToRescueArg, minimumNumberOfSpeciesArg,
    )
    command = "cactus_bar --cactusDisk '%s' --logLevel %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s %s" % commandArgs
    masterMessages = popenCatch(command, stdinString=flowerNames)
    logger.info("Ran cactus_bar okay")
    return [ message for message in masterMessages.split("\n") if message != '' ]
Ejemplo n.º 23
0
 def testSimpleCoverageOnB(self):
     # Coverage of the simple cigar file on genome B
     command = "cactus_coverage %s %s" % (self.simpleFastaPathB,
                                          self.simpleCigarPath)
     observed = popenCatch(command)
     expected = dedent('''\
     id=2|simpleSeqB1\t0\t12\t\t1
     id=2|simpleSeqB1\t17\t19\t\t1
     id=2|simpleSeqB1\t21\t32\t\t1
     ''')
     self.assertEqual(observed, expected)
Ejemplo n.º 24
0
def percentCoverage(sequenceFile, coverageFile):
    """Return the percentage of a sequence file covered by a coverage file.

    The covered length is the sum of (column 3 - column 2) over every line
    of coverageFile, computed with awk. Returns 0 when the sequence has
    zero length or when awk produces no total.
    """
    totalLength = sequenceLength(sequenceFile)
    if totalLength != 0:
        awkCommand = "awk '{ total += $3 - $2 } END { print total }' %s" % coverageFile
        coveredBases = popenCatch(awkCommand)
        # awk prints an empty line when the coverage file has no lines.
        if coveredBases.strip() != '':
            return 100*float(coveredBases)/totalLength
    return 0
Ejemplo n.º 25
0
def get_masked_bases(ftp_url):
    """Return the number of masked bases listed in an assembly's `_rm.out.gz`.

    The file fetched is ``<ftp_url>/<last path component of ftp_url>_rm.out.gz``
    (presumably a RepeatMasker ``.out`` table -- confirm against the data
    source). It is downloaded with curl, decompressed, the first three lines
    are dropped, and (column 7 - column 6) is summed over the remaining lines.

    :param ftp_url: base URL of the assembly directory; '' means "no data".
    :return: the summed total as an int, or 0 when ftp_url is empty or the
        pipeline produced no total.
    """
    if ftp_url == '':
        return 0
    paths = ftp_url.split('/')
    rm_out_url = ftp_url + '/' + paths[-1] + '_rm.out.gz'
    output = popenCatch(
        "curl -s %s | gzip -d | sed '1,3d' | awk '{total += $7 - $6} END { print total }'"
        % rm_out_url)
    # awk prints an empty line when it saw no rows; int('') would raise
    # ValueError, so treat that case as zero masked bases.
    if output.strip() == '':
        return 0
    return int(output)
Ejemplo n.º 26
0
def splitBed(bed, numParts):
    """Split a bed file by lines into numParts pieces.

    Uses `wc -l` to count the lines and `split -l` to write the pieces next
    to the input as ``<bed>.temp.<SUFFIX>*``. Returns the paths of the
    pieces found by globbing that prefix.
    """
    lineCount = int(popenCatch("wc -l %s | cut -d' ' -f 1" % bed))
    # A random 7-letter tag keeps two runs on the same file from colliding.
    letters = [random.choice(string.ascii_uppercase) for _ in xrange(7)]
    suffix = "".join(letters)
    linesPerPart = math.ceil(float(lineCount) / numParts)
    system("split -l %d %s %s.temp.%s" % (linesPerPart, bed, bed, suffix))
    pieces = glob('%s.temp.%s*' % (bed, suffix))
    return pieces
Ejemplo n.º 27
0
def main():
    """Lift mutations from the reference genome towards each target genome.

    Parses the command line, reads the species tree from the hal file with
    `halStats --tree`, validates that the reference and all targets are
    leaves, creates the output directory if needed, and for every target
    calls liftMutations for each genome on the tree path from the reference
    to the target, writing to ``<outputDir>/<target>.bed``.

    Raises ValueError if the reference or any target is not a leaf genome.
    """
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('hal', help='hal file')
    parser.add_argument('refGenome', help='reference genome')
    parser.add_argument('halTreeMutationsDir',
                        help='the directory output by halTreeMutations.py')
    parser.add_argument(
        '--targets',
        help='target genomes (comma-separated), default: all leaves')
    parser.add_argument('outputDir',
                        help='output directory for reference beds')
    opts = parser.parse_args()

    # Get the species tree from the hal file.
    newickTree = popenCatch('halStats --tree %s' % (opts.hal))
    tree = NXNewick().parseString(newickTree)

    # Set the target genomes to be all leaves (minus the reference) if not otherwise directed.
    leafGenomes = [tree.getName(x) for x in tree.getLeaves()]
    if opts.refGenome not in leafGenomes:
        raise ValueError("Reference genome %s is not a leaf genome." %
                         opts.refGenome)
    if opts.targets is None:
        opts.targets = [x for x in leafGenomes if x != opts.refGenome]
    else:
        opts.targets = opts.targets.split(',')
        if not all(x in leafGenomes for x in opts.targets):
            raise ValueError("Some target genomes are not leaves.")

    # Create the output directory; an already-existing directory is fine,
    # any other failure is re-raised.
    try:
        os.makedirs(opts.outputDir)
    except OSError:
        if not os.path.isdir(opts.outputDir):
            raise

    # The reference's tree ID does not depend on the target.
    refID = getTreeID(tree, opts.refGenome)
    for target in opts.targets:
        targetID = getTreeID(tree, target)
        mrca = getMRCA(tree, refID, targetID)
        mrcaName = tree.getName(mrca)
        pathToTarget = getPath(opts.hal, opts.refGenome, target)
        # Split the path at the MRCA: the genomes before it are walked
        # upwards, the ones after it downwards. The MRCA itself is dropped.
        # The unpacking requires exactly two non-MRCA runs in the path.
        pathUp, pathDown = [
            list(v) for k, v in groupby(
                pathToTarget, lambda x: x == mrcaName) if not k
        ]
        bedForTarget = os.path.join(opts.outputDir, target + '.bed')
        # First, walk up the tree to the MRCA.
        for curGenome in pathUp:
            liftMutations(opts.halTreeMutationsDir,
                          opts.hal,
                          curGenome,
                          opts.refGenome,
                          bedForTarget,
                          reversePolarity=True)
        # Next, walk down the tree to the target.
        for curGenome in pathDown:
            liftMutations(opts.halTreeMutationsDir, opts.hal, curGenome,
                          opts.refGenome, bedForTarget)
Ejemplo n.º 28
0
def runReferenceMedianProblemTest(medianHistory, greedyIterations, theta):
    """Runs the reference problem for a given median history.

    Builds a weight for every (node1, node2) transitive adjacency seen in the
    leaf genomes (summing weightFn(distance, theta) over genomes), writes the
    problem as one tab-separated line to a temporary file, and feeds it on
    stdin to the `referenceMedianProblemTest2` binary found in the `testBin`
    directory next to the matchingAndOrdering.tests.simulatedGenome module.
    The whitespace-separated integers printed by the binary are translated
    back into signed elements forming a single chromosome.

    Returns the resulting Genome. Asserts that it has the same elements as
    the median genome of medianHistory.
    """
    #Make adjacencies
    stubNumber = 2
    nodeNumber = len(
        medianHistory.getMedianGenome().getElements()) * 2 + stubNumber
    weights = {}
    for genome in medianHistory.getLeafGenomes():
        for node1, node2, distance in genome.getTransitiveAdjacencies():
            key = (node1, node2)
            weights[key] = weights.get(key, 0) + weightFn(distance, theta)

    def translateLeftSideOfElementToNode(element):
        # Negative elements map to even nodes, positive ones to odd nodes.
        assert element != 0
        if element < 0:
            return abs(element) * 2
        return element * 2 + 1

    def translateLeftNodeToElement(node):
        # Inverse mapping: even nodes yield a negative element.
        assert node >= stubNumber
        assert node < nodeNumber
        element = node // 2
        if (node % 2) == 0:
            element *= -1
        return element

    #Problem line: iterations, node count, stub count, edge count, then one
    #"node node weight" triple per weighted adjacency.
    edges = "\t".join([
        "%i\t%i\t%f" % (translateLeftSideOfElementToNode(-node1),
                        translateLeftSideOfElementToNode(node2), weight)
        for (node1, node2), weight in weights.items()
    ])
    problemInput = "%i\t%i\t%i\t%i\t%s" % (
        greedyIterations, nodeNumber, stubNumber, len(weights), edges)
    tempPath = getTempFile()
    try:
        with open(tempPath, 'w') as tempFile:
            tempFile.write(problemInput)
        #Command
        command = os.path.join(
            os.path.dirname(
                os.path.abspath(
                    matchingAndOrdering.tests.simulatedGenome.__file__)),
            "testBin", "referenceMedianProblemTest2")
        output = popenCatch(command + " < %s" % tempPath)
    finally:
        #Remove the temporary input even when the command fails.
        os.remove(tempPath)
    medianChromosome = Chromosome()
    for adjacency in output.split():
        medianChromosome.append(translateLeftNodeToElement(int(adjacency)))
    medianGenome = Genome(chromosomeNumber=0, elementNumber=0)
    medianGenome.addChromosome(medianChromosome)
    assert medianGenome.getElements() == medianHistory.getMedianGenome(
    ).getElements()
    return medianGenome
Ejemplo n.º 29
0
def percentCoverage(sequenceFile, coverageFile):
    """Compute what percentage of a sequence file a coverage file covers.

    awk sums (column 3 - column 2) over the lines of coverageFile; that
    total is divided by the length reported by sequenceLength(). Returns 0
    for a zero-length sequence or when there are no coverage lines.
    """
    seqLength = sequenceLength(sequenceFile)
    if seqLength == 0:
        return 0
    command = "awk '{ total += $3 - $2 } END { print total }' %s" % coverageFile
    awkTotal = popenCatch(command)
    hasCoverage = awkTotal.strip() != ''  # empty output means no coverage lines
    if not hasCoverage:
        return 0
    return 100*float(awkTotal)/seqLength
def getChromSizes(halPath, genome):
    """Return a {chromosome name: size} dict for a genome in a hal file.

    Parses the output of `halStats --chromSizes`; lines that do not consist
    of exactly two tab-separated fields are ignored.
    """
    command = "halStats --chromSizes %s %s" % (genome, halPath)
    sizes = {}
    for row in popenCatch(command).split("\n"):
        columns = row.split("\t")
        if len(columns) == 2:
            name, size = columns
            sizes[name] = int(size)
    return sizes
Ejemplo n.º 31
0
 def testDepthByIDOnB(self):
     """Check the bed output of `cactus_coverage --depthById` for genome B."""
     depthCommand = "cactus_coverage --depthById %s %s" % (
         self.simpleFastaPathB, self.simpleCigarPath)
     actualBed = popenCatch(depthCommand)
     # The original author expected the same output as the plain run except
     # for 30-31, where the depth should be 2.
     # NOTE(review): the expected bed below reports depth 1 for the whole of
     # 21-32 -- confirm which of the two is intended.
     expectedBed = dedent('''\
     id=2|simpleSeqB1\t0\t12\t\t1
     id=2|simpleSeqB1\t17\t19\t\t1
     id=2|simpleSeqB1\t21\t32\t\t1
     ''')
     self.assertEqual(actualBed, expectedBed)
Ejemplo n.º 32
0
 def run(self):
     """Chunk the flower sequences and schedule the all-against-all blast.

     Runs cactus_blast_chunkFlowerSequences, which prints one chunk file
     path per line, then adds a MakeBlastsAllAgainstAll child target for
     the resulting chunks.
     """
     chunksDir = makeSubDir(os.path.join(self.getGlobalTempDir(), "chunks"))
     chunkArgs = (getLogLevelString(), self.cactusDisk, self.flowerName,
                  self.blastOptions.chunkSize,
                  self.blastOptions.overlapSize,
                  self.blastOptions.minimumSequenceLength,
                  chunksDir)
     chunkOutput = popenCatch("cactus_blast_chunkFlowerSequences %s '%s' %s %i %i %i %s" % chunkArgs)
     # Drop the empty strings produced by splitting on newlines.
     chunks = []
     for chunkPath in chunkOutput.split("\n"):
         if chunkPath != "":
             chunks.append(chunkPath)
     logger.info("Broken up the flowers into individual 'chunk' files")
     blastTarget = MakeBlastsAllAgainstAll(self.blastOptions, chunks, self.finalResultsFile)
     self.addChildTarget(blastTarget)
Ejemplo n.º 33
0
    def testProgressiveOutgroupsVsAllOutgroups(self):
        """Compare outgroup coverage on an ingroup between the "set against
        set" mode and the "ingroups vs. outgroups" mode of cactus blast.

        The trimming ("ingroups vs. outgroups") run must keep at least 95%
        of the set-vs-set coverage, while the coverage contributed by the
        last outgroup must be at most 10% of what it is in set-vs-set mode.
        """
        encodeRegion = "ENm001"
        ingroup = "human"
        outgroups = ["macaque", "rabbit", "dog"]
        lastOutgroup = outgroups[-1]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPath = os.path.join(regionPath, ingroup + "." + encodeRegion + ".fa")
        outgroupPaths = [os.path.join(regionPath, name + "." + encodeRegion + ".fa") for name in outgroups]

        # "Set against set" mode: the entire ingroup is aligned against
        # each outgroup.
        setVsSetJobTree = os.path.join(self.tempDir, "setVsSetJobTree")
        runCactusBlast([ingroupPath], self.tempOutputFile, setVsSetJobTree,
                       chunkSize=500000, overlapSize=10000,
                       targetSequenceFiles=outgroupPaths)
        # "Ingroup vs outgroups" mode: the ingroup is aligned against the
        # outgroups in order, trimming away sequence that has already been
        # aligned.
        blastCommand = "cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/outgroupJobTree" % (ingroupPath, ",".join(outgroupPaths), self.tempOutputFile2, self.tempDir)
        system(blastCommand)

        # Coverage on the ingroup, in bases, from each run.
        totalCoverageCommand = "cactus_coverage %s %s | awk '{ total +=  $3 - $2} END { print total }'"
        coverageSetVsSet = int(popenCatch(totalCoverageCommand % (ingroupPath, self.tempOutputFile)))
        coverageIngroupVsOutgroups = int(popenCatch(totalCoverageCommand % (ingroupPath, self.tempOutputFile2)))

        print("total coverage on human (set vs set mode, %d outgroups): %d" % (len(outgroups), coverageSetVsSet))
        print("total coverage on human (ingroup vs outgroup mode, %d outgroups): %d" % (len(outgroups), coverageIngroupVsOutgroups))

        # The trimming strategy should still find a reasonable fraction of
        # the alignments.
        keptFraction = float(coverageIngroupVsOutgroups)/coverageSetVsSet
        self.assertTrue(keptFraction >= 0.95)

        # Coverage on the ingroup, in bases, from just the last outgroup.
        # This should be much higher in set vs set mode than in ingroup vs
        # outgroup mode.
        lastOutgroupCommand = "grep %s %s | cactus_coverage %s /dev/stdin | awk '{ total +=  $3 - $2} END { print total }'"
        coverageFromLastOutgroupSetVsSet = int(popenCatch(lastOutgroupCommand % (lastOutgroup, self.tempOutputFile, ingroupPath)))
        coverageFromLastOutgroupInVsOut = int(popenCatch(lastOutgroupCommand % (lastOutgroup, self.tempOutputFile2, ingroupPath)))

        print("total coverage on human from last outgroup in set (%s) (set vs set mode): %d" % (lastOutgroup, coverageFromLastOutgroupSetVsSet))
        print("total coverage on human from last outgroup in set (%s) (ingroup vs outgroup mode): %d" % (lastOutgroup, coverageFromLastOutgroupInVsOut))

        lastOutgroupFraction = float(coverageFromLastOutgroupInVsOut)/coverageFromLastOutgroupSetVsSet
        self.assertTrue(lastOutgroupFraction <= 0.10)
Ejemplo n.º 34
0
 def testFlanking(self):
     """Check cactus_trimSequences.py output when run with --flanking 1."""
     trimCommand = "cactus_trimSequences.py --flanking 1 --minSize 0 --windowSize 1 --threshold 1 %s %s" % (self.faPath, self.bedPath)
     fa = popenCatch(trimCommand)
     # The two blocks 0-5, 6-11 should be merged together since their
     # flanking sequence intersects. Additionally the flanking sequence
     # shouldn't go past the beginning of the sequence.
     mergedBlock = dedent('''\
     >seq1|0
     CATGCATGCATG''')
     self.assertTrue(mergedBlock in fa)
     flankedBlock = dedent('''\
     >seq1|14
     TGC''')
     self.assertTrue(flankedBlock in fa)
Ejemplo n.º 35
0
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding an extra outgroup only adds alignments if
        possible, and doesn't lose any.

        For each region the ingroups are aligned against the first 1, 2, 3
        and 4 of the selected outgroups. Coverage diagnostics are printed,
        and every run with more outgroups must reach a sensitivity of at
        least 0.99 when compared with the single-outgroup run.
        """
        # xrange(1,2) yields only 1, so the single region "ENm001" is used.
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ]
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        # subselect 4 random ordered outgroups
        outgroups = [outgroups[i] for i in sorted(random.sample(xrange(len(outgroups)), 4))]
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            ingroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), ingroups)
            outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
            # One alignments file per run, in order of increasing outgroup count.
            results = []
            for numOutgroups in xrange(1,5):
                # Align w/ increasing numbers of outgroups
                subResults = getTempFile()
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                print "aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths))
                # NOTE(review): the same toil directory path is passed to every run.
                tmpToil = os.path.join(self.tempDir, "outgroupToil")
                runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)
                results.append(subResults)

            # Print diagnostics about coverage
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    ingroupCoverage = getTempFile(rootDir=self.tempDir)
                    coverageWorkDir = getTempDirectory(rootDir=self.tempDir)
                    calculateCoverage(work_dir=coverageWorkDir, sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                    # Sum (column 3 - column 2) over the coverage file lines.
                    coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                    print "covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases)

            resultsSets = map(lambda x : loadResults(x), results)
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Make sure the results from (n+1) outgroups are
                # (very nearly) a superset of the results from n outgroups
                # NOTE(review): every run is compared against resultsSets[0]
                # (the single-outgroup run), not against the preceding run.
                print "Using %d addl outgroup(s):" % (i + 1)
                comparator =  ResultComparator(resultsSets[0], moreOutgroupsResults)
                print comparator
                self.assertTrue(comparator.sensitivity >= 0.99)

            # Ensure that the new alignments don't cover more than
            # x% of already existing alignments to human
            # (this section only prints the figure; nothing is asserted)
            for i in xrange(1, len(resultsSets)):
                # Element [0] of each loaded result is used as a set of
                # alignment tuples (presumably (seq1, pos1, seq2, pos2) --
                # verify against loadResults).
                prevResults = resultsSets[i-1][0]
                curResults = resultsSets[i][0]
                # Keep the (sequence, position) pair on the human side of each alignment.
                prevResultsHumanPos = set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], prevResults)))
                newAlignments = curResults.difference(prevResults)
                newAlignmentsHumanPos =  set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], newAlignments)))
                print "addl outgroup %d:" % i
                # Raises ZeroDivisionError if the previous run has no human positions.
                print "bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)))
            # Clean up the per-run alignment files.
            for subResult in results:
                os.remove(subResult)
Ejemplo n.º 36
0
def liftoverLine(halFile, refGenome, refBedLine, targetGenome, targetSeq=None):
    """Lift a bed line from refGenome to targetGenome and return PSL lines.

    The bed line is piped to `halLiftover --outPSL` on stdin. Empty output
    lines are dropped. When targetSeq is given, only lines whose 14th
    whitespace-separated field equals targetSeq are returned.
    """
    command = "halLiftover --outPSL %s %s stdin %s stdout" % (halFile, refGenome, targetGenome)
    rawOutput = popenCatch(command, stdinString=refBedLine)
    nonEmpty = [line for line in rawOutput.split("\n") if line != ""]
    if targetSeq is None:
        return nonEmpty
    return [line for line in nonEmpty if line.split()[13] == targetSeq]
Ejemplo n.º 37
0
def runCactusSetup(cactusDiskDatabaseString, sequences, 
                   newickTreeString, logLevel=None, outgroupEvents=None,
                   makeEventHeadersAlphaNumeric=None):
    """Run the cactus_setup binary and return its non-empty output lines.

    The sequences are passed space-separated, followed by the species tree,
    the cactus disk string, the log level, and the two optional flags
    rendered by nameValue().
    """
    logLevel = getLogLevelString2(logLevel)
    outgroupEvents = nameValue("outgroupEvents", outgroupEvents, str, quotes=True)
    makeEventHeadersAlphaNumeric = nameValue("makeEventHeadersAlphaNumeric", makeEventHeadersAlphaNumeric, bool)
    commandTemplate = ("cactus_setup %s --speciesTree '%s' --cactusDisk '%s' "
                       "--logLevel %s %s %s")
    commandArgs = (" ".join(sequences), newickTreeString,
                   cactusDiskDatabaseString, logLevel, outgroupEvents,
                   makeEventHeadersAlphaNumeric)
    masterMessages = popenCatch(commandTemplate % commandArgs)
    logger.info("Ran cactus setup okay")
    messages = []
    for message in masterMessages.split("\n"):
        if message != '':
            messages.append(message)
    return messages
Ejemplo n.º 38
0
def main(argv=None):
    """Run a command line and print how long it took and what it used.

    :param argv: argument vector, defaults to sys.argv. It must hold exactly
        two items: the program name and the command line to run.
    :return: 0 after the command has run. Exits with status 1 after printing
        a usage message when the argument count is wrong.

    Prints a single tuple made of the wall-clock seconds followed by
    whatever getTotalCpuTimeAndMemoryUsage() returns (it must return a
    tuple for the concatenation to work).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("usage: runAndGetResources.py \'cmdline\'")
        sys.exit(1)
    cmdline = argv[1]
    wallStart = time.time()
    # Only the elapsed time matters; the command's output is discarded.
    popenCatch(cmdline)
    wallClock = time.time() - wallStart
    # The doubled parentheses print the concatenated tuple under both the
    # print statement and the print function.
    print((wallClock,) + getTotalCpuTimeAndMemoryUsage())
    return 0
Ejemplo n.º 39
0
def main(argv=None):
    """Time a command line and report its resource usage.

    :param argv: argument vector, defaults to sys.argv. It must hold exactly
        two items: the program name and the command line to run.
    :return: 0 after the command has run. Exits with status 1 after printing
        a usage message when the argument count is wrong.

    The printed value is one tuple: the wall-clock seconds followed by the
    result of getTotalCpuTimeAndMemoryUsage() (which must be a tuple for the
    concatenation to work).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print("usage: runAndGetResources.py \'cmdline\'")
        sys.exit(1)
    cmdline = argv[1]
    wallStart = time.time()
    # The command's output is not needed, only how long it ran.
    popenCatch(cmdline)
    wallClock = time.time() - wallStart
    # Parenthesised so the tuple prints identically with the print statement
    # and the print function.
    print((wallClock,) + getTotalCpuTimeAndMemoryUsage())
    return 0
Ejemplo n.º 40
0
 def testComplement(self):
     """Check cactus_trimSequences.py output when run with --complement."""
     trimCommand = "cactus_trimSequences.py --flanking 0 --minSize 0 --windowSize 1 --threshold 1 --complement %s %s" % (self.faPath, self.bedPath)
     fa = popenCatch(trimCommand)
     expectedFragments = (
         dedent('''\
     >seq1|5
     A'''),
         dedent('''\
     >seq1|11'''),
         dedent('''\
     >seq1|16'''),
         # make sure the sequence that isn't covered at all is included
         dedent('''\
     >seq2|0'''),
     )
     for fragment in expectedFragments:
         self.assertTrue(fragment in fa)
Ejemplo n.º 41
0
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding an extra outgroup only adds alignments if
        possible, and doesn't lose any.

        The ingroups of region ENm001 are aligned against the first 1, 2, ...
        MAX_NUM_OUTGROUPS of the selected outgroups. Coverage diagnostics are
        printed, and every run with more outgroups must reach a sensitivity
        of at least 0.99 when compared with the single-outgroup run.
        """
        encodeRegion = "ENm001"
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        MAX_NUM_OUTGROUPS = 3
        # subselect a random set of outgroups in the same order
        outgroups = [outgroups[i] for i in sorted(random.sample(xrange(len(outgroups)), MAX_NUM_OUTGROUPS))]
        regionPath = os.path.join(self.encodePath, encodeRegion)
        ingroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), ingroups)
        outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
        # One alignments file per run, in order of increasing outgroup count.
        results = []
        for numOutgroups in xrange(1, len(outgroups) + 1):
            # Align w/ increasing numbers of outgroups
            subResults = getTempFile()
            subOutgroupPaths = outgroupPaths[:numOutgroups]
            print "aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths))
            # NOTE(review): the same toil directory path is passed to every run.
            tmpToil = os.path.join(self.tempDir, "outgroupToil")
            runCactusBlastIngroupsAndOutgroups(ingroupPaths, subOutgroupPaths, alignmentsFile=subResults, toilDir=tmpToil)
            results.append(subResults)

        # Print diagnostics about coverage
        for i, subResults in enumerate(results):
            for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                ingroupCoverage = getTempFile(rootDir=self.tempDir)
                calculateCoverage(sequenceFile=ingroupPath, cigarFile=subResults, outputFile=ingroupCoverage)
                # Sum (column 3 - column 2) over the coverage file lines.
                coveredBases = popenCatch("cat %s | awk '{ total += $3 - $2 } END { print total }'" % ingroupCoverage)
                print "covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases)

        resultsSets = map(lambda x : loadResults(x), results)
        for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
            # Make sure the results from (n+1) outgroups are
            # (very nearly) a superset of the results from n outgroups
            # NOTE(review): every run is compared against resultsSets[0]
            # (the single-outgroup run), not against the preceding run.
            print "Using %d addl outgroup(s):" % (i + 1)
            comparator =  ResultComparator(resultsSets[0], moreOutgroupsResults)
            print comparator
            self.assertTrue(comparator.sensitivity >= 0.99)

        # Ensure that the new alignments don't cover more than
        # x% of already existing alignments to human
        # (this section only prints the figure; nothing is asserted)
        for i in xrange(1, len(resultsSets)):
            # Element [0] of each loaded result is used as a set of
            # alignment tuples (presumably (seq1, pos1, seq2, pos2) --
            # verify against loadResults).
            prevResults = resultsSets[i-1][0]
            curResults = resultsSets[i][0]
            # Keep the (sequence, position) pair on the human side of each alignment.
            prevResultsHumanPos = set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], prevResults)))
            newAlignments = curResults.difference(prevResults)
            newAlignmentsHumanPos =  set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], newAlignments)))
            print "addl outgroup %d:" % i
            # Raises ZeroDivisionError if the previous run has no human positions.
            print "bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)))
        # Clean up the per-run alignment files.
        for subResult in results:
            os.remove(subResult)
Ejemplo n.º 42
0
    def testAddingOutgroupsImprovesResult(self):
        """Run blast on "ingroup" and "outgroup" encode regions, and ensure
        that adding an extra outgroup only adds alignments if
        possible, and doesn't lose any.

        For each region the ingroups are aligned with cactus_blast.py against
        the first 1, 2, 3 and 4 of the selected outgroups. Coverage
        diagnostics are printed, and every run with more outgroups must reach
        a sensitivity of at least 0.99 when compared with the single-outgroup
        run.
        """
        # xrange(1,2) yields only 1, so the single region "ENm001" is used.
        encodeRegions = [ "ENm00" + str(i) for i in xrange(1,2) ]
        ingroups = ["human", "macaque"]
        outgroups = ["rabbit", "dog", "rat", "platypus", "xenopus", "fugu"]
        # subselect 4 random ordered outgroups
        outgroups = [outgroups[i] for i in sorted(random.sample(xrange(len(outgroups)), 4))]
        for encodeRegion in encodeRegions:
            regionPath = os.path.join(self.encodePath, encodeRegion)
            ingroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), ingroups)
            outgroupPaths = map(lambda x: os.path.join(regionPath, x + "." + encodeRegion + ".fa"), outgroups)
            # One cigars file per run, in order of increasing outgroup count.
            results = []
            for numOutgroups in xrange(1,5):
                # Align w/ increasing numbers of outgroups
                subResults = getTempFile()
                subOutgroupPaths = outgroupPaths[:numOutgroups]
                tmpJobTree = getTempDirectory()
                print "aligning %s vs %s" % (",".join(ingroupPaths), ",".join(subOutgroupPaths))
                system("cactus_blast.py --ingroups %s --outgroups %s --cigars %s --jobTree %s/jobTree" % (",".join(ingroupPaths), ",".join(subOutgroupPaths), subResults, tmpJobTree))
                # The job tree directory is only needed while the run lasts.
                system("rm -fr %s" % (tmpJobTree))
                results.append(subResults)

            # Print diagnostics about coverage
            for i, subResults in enumerate(results):
                for ingroup, ingroupPath in zip(ingroups, ingroupPaths):
                    # Sum (column 3 - column 2) over the cactus_coverage output lines.
                    coveredBases = popenCatch("cactus_coverage %s %s | awk '{ total += $3 - $2 } END { print total }'" % (ingroupPath, subResults))
                    print "covered bases on %s using %d outgroups: %s" % (ingroup, i + 1, coveredBases)

            resultsSets = map(lambda x : loadResults(x), results)
            for i, moreOutgroupsResults in enumerate(resultsSets[1:]):
                # Make sure the results from (n+1) outgroups are
                # (very nearly) a superset of the results from n outgroups
                # NOTE(review): every run is compared against resultsSets[0]
                # (the single-outgroup run), not against the preceding run.
                print "Using %d addl outgroup(s):" % (i + 1)
                comparator =  ResultComparator(resultsSets[0], moreOutgroupsResults)
                print comparator
                self.assertTrue(comparator.sensitivity >= 0.99)

            # Ensure that the new alignments don't cover more than
            # x% of already existing alignments to human
            # (this section only prints the figure; nothing is asserted)
            for i in xrange(1, len(resultsSets)):
                # Element [0] of each loaded result is used as a set of
                # alignment tuples (presumably (seq1, pos1, seq2, pos2) --
                # verify against loadResults).
                prevResults = resultsSets[i-1][0]
                curResults = resultsSets[i][0]
                # Keep the (sequence, position) pair on the human side of each alignment.
                prevResultsHumanPos = set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], prevResults)))
                newAlignments = curResults.difference(prevResults)
                newAlignmentsHumanPos =  set(map(lambda x: (x[0], x[1]) if "human" in x[0] else (x[2], x[3]), filter(lambda x: "human" in x[0] or "human" in x[2], newAlignments)))
                print "addl outgroup %d:" % i
                # Raises ZeroDivisionError if the previous run has no human positions.
                print "bases re-covered: %f (%d)" % (len(newAlignmentsHumanPos.intersection(prevResultsHumanPos))/float(len(prevResultsHumanPos)), len(newAlignmentsHumanPos.intersection(prevResultsHumanPos)))
            # Clean up the per-run cigars files.
            for subResult in results:
                os.remove(subResult)
Ejemplo n.º 43
0
 def testSimplestParameters(self):
     """Run cactus_trimSequences.py with no flanking, no minimum size and
     a window of 1, to check that bed import / fasta export works."""
     trimCommand = "cactus_trimSequences.py --flanking 0 --minSize 0 --windowSize 1 --threshold 1 %s %s" % (self.faPath, self.bedPath)
     fa = popenCatch(trimCommand)
     expectedFragments = (
         dedent('''\
     >seq1|0
     CATGC'''),
         dedent('''\
     >seq1|6
     TGCAT'''),
         dedent('''\
     >seq1|15
     G'''),
     )
     for fragment in expectedFragments:
         self.assertTrue(fragment in fa)
Ejemplo n.º 44
0
def getBedLineForSequence(halFile, genome, sequence):
    """Return the tab-joined bed line for one sequence of a genome.

    The bed lines come from `halStats --bedSequences`; the line whose first
    whitespace-separated field equals `sequence` is returned. Raises
    RuntimeError when no line, or more than one line, matches.
    """
    output = popenCatch(
        "halStats --bedSequences %s %s" % (genome, halFile))
    allFields = [line.split() for line in output.split("\n") if line != ""]
    matches = [fields for fields in allFields if fields[0] == sequence]
    if len(matches) == 0:
        raise RuntimeError("No sequence named %s found in genome %s" % (sequence,
                                                                        genome))
    if len(matches) > 1:
        raise RuntimeError("More than one sequence named %s in genome %s, "
                           "aborting!" % (sequence, genome))
    return "\t".join(matches[0])
Ejemplo n.º 45
0
def liftoverLine(halFile, refGenome, refBedLine, targetGenome, targetSeq=None):
    """Return the PSL lines halLiftover produces for one bed line.

    refBedLine is sent to `halLiftover --outPSL` on stdin and the alignment
    between refGenome and targetGenome is read back from stdout, with empty
    lines removed. If targetSeq is given, only the lines whose 14th
    whitespace-separated field equals it are kept.
    """
    liftoverArgs = (halFile, refGenome, targetGenome)
    liftoverOutput = popenCatch("halLiftover --outPSL %s %s stdin %s stdout" % liftoverArgs,
                                stdinString=refBedLine)
    pslLines = []
    for pslLine in liftoverOutput.split("\n"):
        if pslLine == "":
            continue
        pslLines.append(pslLine)
    if targetSeq is not None:
        pslLines = [pslLine for pslLine in pslLines
                    if pslLine.split()[13] == targetSeq]
    return pslLines
Ejemplo n.º 46
0
def realign(fasta, ref):
    '''
    Realign every alignment block of each genome's chained AXT file against
    the reference with cPecanRealign, printing the output lines of each run.

    The AXT file for a genome is read from ``<genome>.chained.axt`` in the
    current directory. Blocks are three consecutive non-blank lines; each
    complete block is converted with axtToCigar and piped to cPecanRealign.

    :param fasta: mapping from genome name to its fasta file
    :param ref: reference species (skipped when iterating over the genomes)
    '''
    seqFile1 = fasta.get(ref)
    for genome in fasta:
        if genome == ref:
            continue
        seqFile2 = fasta.get(genome)
        axt = genome + '.chained.axt'
        with open(axt) as infile:
            block = []
            for line in infile:
                col = line.split()
                if not col:
                    # Lines read from a file keep their newline, so a blank
                    # separator line must be detected after splitting. It
                    # ends the current block.
                    block = []
                    continue
                block.append(col)
                if len(block) == 3:
                    cig = axtToCigar(block)
                    realignCommand = "echo '%s' | cPecanRealign %s %s %s" % (cig, "-u /dev/stdout", seqFile1, seqFile2)
                    print(popenCatch(realignCommand).split("\n"))
                    # Start afresh so every following block is realigned too.
                    block = []
Ejemplo n.º 47
0
def getBedLineForSequence(halFile, genome, sequence):
    """Look up the bed line spanning a whole sequence of a genome.

    Runs `halStats --bedSequences` and returns, tab-joined, the fields of
    the single line whose first field equals `sequence`. A RuntimeError is
    raised if the name matches several lines or none.
    """
    command = "halStats --bedSequences %s %s" % (genome, halFile)
    splitLines = [bedLine.split()
                  for bedLine in popenCatch(command).split("\n")
                  if bedLine != ""]
    seqLines = []
    for fields in splitLines:
        if fields[0] == sequence:
            seqLines.append(fields)
    matchCount = len(seqLines)
    if matchCount > 1:
        raise RuntimeError("More than one sequence named %s in genome %s, "
                           "aborting!" % (sequence, genome))
    if matchCount == 0:
        raise RuntimeError("No sequence named %s found in genome %s" %
                           (sequence, genome))
    return "\t".join(seqLines[0])
Ejemplo n.º 48
0
 def testCPecanRealignRescoreByIdentityAndProb(self):
     """Runs the realign command with each of the three rescoring options
     and checks that every rescored cigar has a score between 0 and 100.
     """
     def nonEmptyLines(command):
         # Output lines of the command, without the empty ones.
         return [ line for line in popenCatch(command).split("\n") if line != '' ]

     for seqFile1, seqFile2 in seqFilePairGenerator():
         byIdentityCommand, lastzCommand = getCommands(seqFile1, seqFile2, realignArguments="--rescoreByIdentity")
         byPosteriorProbCommand = getCommands(seqFile1, seqFile2, realignArguments="--rescoreByPosteriorProb")[0]
         byIdentityIgnoringGapsCommand = getCommands(seqFile1, seqFile2, realignArguments="--rescoreByIdentityIgnoringGaps")[0]
         lineGroups = zip(nonEmptyLines(byIdentityCommand),
                          nonEmptyLines(byPosteriorProbCommand),
                          nonEmptyLines(byIdentityIgnoringGapsCommand),
                          nonEmptyLines(lastzCommand))
         for byIdentityLine, byPosteriorProbLine, byIdentityIgnoringGapsLine, lastzLine in lineGroups:
             byIdentityCigar = cigarReadFromString(byIdentityLine)
             byPosteriorProbCigar = cigarReadFromString(byPosteriorProbLine)
             byIdentityIgnoringGapsCigar = cigarReadFromString(byIdentityIgnoringGapsLine)
             # The lastz line is parsed as well, but its score is not checked.
             lastzCigar = cigarReadFromString(lastzLine)
             #Check scores are as expected
             for rescoredCigar in (byIdentityCigar, byPosteriorProbCigar, byIdentityIgnoringGapsCigar):
                 self.assertTrue(rescoredCigar.score >= 0)
                 self.assertTrue(rescoredCigar.score <= 100.0)
Ejemplo n.º 49
0
 def run(self):
     """Schedule a GetInsertedColumnBed child for every internal genome of
     the hal tree, then RunAncestorsMLParallel as the follow-on target."""
     # The tree comes from the hal file itself.
     newickStr = popenCatch("halStats --tree %s" % self.halFile)
     tree = NXNewick().parseString(newickStr)
     bedFiles = {} # genome => bed file of inserted columns
     for nodeId in tree.postOrderTraversal():
         isLeaf = len(tree.getChildren(nodeId)) == 0
         if not isLeaf:
             assert tree.hasName(nodeId)
             genome = tree.getName(nodeId)
             insertedColumnsBed = getTempFile(rootDir=self.getGlobalTempDir())
             bedFiles[genome] = insertedColumnsBed
             self.addChildTarget(GetInsertedColumnBed(self.halFile, genome, insertedColumnsBed))
     followOn = RunAncestorsMLParallel(self.halFile, self.phyloPModel, bedFiles, self.jobsPerGenome, self.threshold)
     self.setFollowOnTarget(followOn)
def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Align one reference transcript against its target region.

    Writes the reference sequence named ``gp.name`` to a temp FASTA, runs
    blat of it against the prepared target file, chains the resulting PSL
    with simpleChain and scores the chained lines with
    evaluate_blat_results.

    Returns the stringified values [gp.id, gp.name, best_cov, best_ident].
    """
    ref_seqs = Fasta(ref_tx_fasta)
    genome_seqs = Fasta(target_genome_fasta)
    tgt_path, ref_path, psl_path = prepare_tmp_files(tmp_dir, gp, genome_seqs)
    fastaWrite(ref_path, gp.name, str(ref_seqs[gp.name]))
    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tgt_path, ref_path,
                                                       psl_path)
    system(blat_cmd)
    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(psl_path)
    # The last element of the split (text after the final newline) is dropped.
    chained_lines = popenCatch(chain_cmd).split("\n")[:-1]
    best_cov, best_ident = evaluate_blat_results(chained_lines)
    fields = [gp.id, gp.name, best_cov, best_ident]
    return map(str, fields)
Ejemplo n.º 51
0
 def sanityCheckSequence(self, path):
     """Warn about common problems with an input sequence (currently off).

     The method returns immediately, so nothing below the ``return`` runs.
     The retained code would run cactus_analyseAssembly on ``path``
     (concatenating the files first when ``path`` is a directory), parse
     the repeat-masked and N proportions out of its output, and write
     warnings to stderr when they exceed fixed thresholds.
     """
     # NOTE: unconditional return -- every check below is disabled.
     # The parsing relies on cactus_analyseAssembly output staying in
     # the format it's currently in.
     return
     tool = "cactus_analyseAssembly"
     if os.path.isdir(path):
         command = "cat %s/* | %s -" % (path, tool)
     else:
         command = tool + " %s" % path
     report = popenCatch(command)
     try:
         maskedMatch = re.search(r'Proportion-repeat-masked: ([0-9.]*)',
                                 report)
         maskedFraction = float(maskedMatch.group(1))
         nMatch = re.search(r'ProportionNs: ([0-9.]*)', report)
         nFraction = float(nMatch.group(1))
     except ValueError:
         # A zero-length genome can make the fractions NaN. Warn and stop,
         # because the remaining checks need the fraction values.
         emptyWarning = ("WARNING: sequence path %s has 0 length. Consider "
                         "removing it from your input file.\n\n")
         sys.stderr.write(emptyWarning % path)
         return
     # Thresholds are fairly arbitrary, but should suit badly- to
     # well-assembled vertebrate genomes.
     if maskedFraction > 0.70:
         maskedWarning = ("WARNING: sequence path %s has an extremely high "
                          "proportion of masked bases: %f. progressiveCactus"
                          " expects a soft-masked genome, i.e. all lowercase"
                          " characters are considered masked. The process "
                          "will proceed normally, but make sure you haven't "
                          "accidentally provided an all-lowercase genome, "
                          "in which case nothing will be aligned to "
                          "it!\n\n")
         sys.stderr.write(maskedWarning % (path, maskedFraction))
     if nFraction > 0.30:
         nWarning = ("WARNING: sequence path %s has an extremely high "
                     "proportion of 'N' bases: %f. The process will "
                     "proceed normally, but make sure your genome "
                     "isn't hard-masked! Alignments to hard-masked "
                     "genomes are much worse than to soft-masked "
                     "genomes. If the genome just has a lot of "
                     "poorly assembled regions, feel free to "
                     "ignore this message.\n\n")
         sys.stderr.write(nWarning % (path, nFraction))
Ejemplo n.º 52
0
def find_single_copy(target, args, chunk, result_path):
    """
    Write one tab-separated line per region with its single-copy ratio.

    For each (chrom, start, stop) in ``chunk`` this runs
    halSingleCopyRegionsExtract on ``args.hal`` / ``args.ref_genome``,
    sums the span (last column minus second-to-last column) of every
    output line, and writes chrom, start, stop and
    format_ratio(total, length) to ``result_path``.
    ``target`` is not used here.
    """
    template = 'halSingleCopyRegionsExtract {} {} --refSequence {} --start {} --length {}'
    with open(result_path, 'w') as outf:
        for chrom, start, stop in chunk:
            start, stop = int(start), int(stop)
            length = stop - start
            output = popenCatch(
                template.format(args.hal, args.ref_genome, chrom, start,
                                length))
            # The last element of the split (text after the final newline)
            # is dropped.
            region_lines = output.split('\n')[:-1]
            tot = sum(
                int(cols[-1]) - int(cols[-2])
                for cols in (line.split() for line in region_lines))
            record = [chrom, start, stop, format_ratio(tot, length)]
            outf.write('\t'.join(str(field) for field in record) + '\n')
Ejemplo n.º 53
0
    def run(self):
        """Fan out one ExtractInsertions child per genome, then reduce.

        Genome names come from ``halStats --genomes``. Every genome gets a
        temp file for its main output, plus optional temp files for the
        entirely-inserted sequences and total-inserted-bases outputs when
        the corresponding option paths are set. ReduceOutputs is set as
        the follow-on and receives the three lists of temp files together
        with the final output paths and header lines.
        """
        opts = self.opts
        genomeNames = popenCatch("halStats --genomes %s" %
                                 opts.halPath).split()
        mainOutputs = []
        entirelyInsertedOutputs = []
        totalInsertedOutputs = []
        for genomeName in genomeNames:
            # Per-genome output; the original notes these are concatenated
            # with the others at the end.
            genomeOutput = getTempFile(rootDir=self.getGlobalTempDir())
            mainOutputs.append(genomeOutput)

            # Optional per-genome file for entirely inserted sequences.
            if opts.entirelyInsertedSequencesPath is None:
                entirelyInsertedTemp = None
            else:
                entirelyInsertedTemp = getTempFile(
                    rootDir=self.getGlobalTempDir())
                entirelyInsertedOutputs.append(entirelyInsertedTemp)

            # Optional per-genome file for total inserted bases.
            if opts.totalInsertedBasesPath is None:
                totalInsertedTemp = None
            else:
                totalInsertedTemp = getTempFile(
                    rootDir=self.getGlobalTempDir())
                totalInsertedOutputs.append(totalInsertedTemp)

            child = ExtractInsertions(opts.halPath, genomeName, genomeOutput,
                                      opts.samplePerGenome, opts.samples,
                                      opts.noGaps, entirelyInsertedTemp,
                                      totalInsertedTemp)
            self.addChildTarget(child)

        # Positional argument lists for ReduceOutputs, one entry per output
        # kind (main, entirely inserted, total inserted bases).
        tempFileLists = [mainOutputs, entirelyInsertedOutputs,
                         totalInsertedOutputs]
        finalPaths = [opts.output, opts.entirelyInsertedSequencesPath,
                      opts.totalInsertedBasesPath]
        thirdArg = [not opts.samplePerGenome, False, False]
        fourthArg = [opts.samples, None, None]
        headers = ['insertionSize\tgenome\tseq\tmaskedBases',
                   'insertionSize\tgenome\tseq\tmaskedBases',
                   'genome\ttotalInsertedBases']
        self.setFollowOnTarget(
            ReduceOutputs(tempFileLists, finalPaths, thirdArg, fourthArg,
                          headers))
Ejemplo n.º 54
0
 def run(self):
     """Split each genome's BED file into chunks and schedule a job per chunk.

     For every (genome, bedFile) pair in ``self.bedFileDict`` the file is
     cut into consecutive line ranges of ceil(numLines / jobsPerGenome)
     lines (never fewer than one). Each range is written to a temp file
     and given to a RunAncestorsML child target with its own temp output.
     WriteNucleotides is set as the follow-on and receives the
     genome -> list of outputs mapping. An empty BED file produces no
     child targets for that genome.
     """
     outputsPerGenome = {}
     for genome, bedFile in self.bedFileDict.items():
         outputsPerGenome[genome] = []
         # Feed the file on stdin so wc prints only the count. With a file
         # argument, BSD/macOS wc left-pads the number, so the previous
         # "| cut -d' ' -f 1" returned an empty string and int() raised
         # ValueError. int() tolerates surrounding whitespace.
         numLines = int(popenCatch("wc -l < %s" % bedFile))
         linesPerJob = max(
             1, int(math.ceil(float(numLines) / self.jobsPerGenome)))
         for start in xrange(0, numLines, linesPerJob):
             # Clamp the final chunk to the end of the file.
             end = min(start + linesPerJob, numLines)
             bedForJob = getTempFile(rootDir=self.getGlobalTempDir())
             # Take the first `end` lines, then keep the last end - start
             # of them, i.e. lines start+1 .. end (1-based).
             system("head -n %d %s | tail -n %d > %s" %
                    (end, bedFile, end - start, bedForJob))
             output = getTempFile(rootDir=self.getGlobalTempDir())
             self.addChildTarget(
                 RunAncestorsML(self.halFile, genome, bedForJob,
                                self.phyloPModel, output))
             outputsPerGenome[genome].append(output)
     self.setFollowOnTarget(WriteNucleotides(outputsPerGenome,
                                             self.halFile))
Ejemplo n.º 55
0
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """
    BLAT each sequence named in ``chunk`` against its reference counterpart.

    For every id in ``chunk`` the sequence from ``target_fasta`` and the
    matching reference sequence from ``ref_fasta`` are written to temp
    FASTA files under the target's local temp dir and aligned with blat.
    One row [id, identity, coverage] is collected per id ("0", "0" when
    blat yields no PSL lines). The rows are written tab-separated to a
    randomly named ".txt" file inside ``out_path``.
    ``g`` is not used here.
    """
    aug_seqs = Fasta(target_fasta)
    ref_seqs = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        # Strip the alignment-number suffixes to recover the reference id.
        aId = remove_augustus_alignment_number(aug_aId)
        gencode_id = remove_alignment_number(aId)
        gencode_seq = str(ref_seqs[gencode_id])
        aug_seq = str(aug_seqs[aug_aId])
        aug_path = os.path.join(target.getLocalTempDir(), "tmp_aug")
        gencode_path = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(aug_path, aug_aId, aug_seq)
        fastaWrite(gencode_path, gencode_id, gencode_seq)
        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(
            gencode_path, aug_path)
        # The last three elements of the split output are discarded --
        # presumably blat's trailing status lines plus the empty string
        # after the final newline (TODO confirm).
        psl_lines = popenCatch(blat_cmd).split("\n")[:-3]
        if psl_lines:
            psl_rows = [PslRow(line) for line in psl_lines]
            row = map(str, [aug_aId, identity(psl_rows), coverage(psl_rows)])
        else:
            row = [aug_aId, "0", "0"]
        rows.append(row)
    result_file = os.path.join(out_path,
                               getRandomAlphaNumericString(10) + ".txt")
    with open(result_file, "w") as outf:
        outf.writelines("\t".join(row) + "\n" for row in rows)
def align_cgp(tmp_dir, gp, target_genome_fasta, tx_dict, ref_tx_fasta):
    """
    Align every transcript listed in ``tx_dict`` against one CGP transcript.

    ``tx_dict`` maps a gene name to its transcript names. Each transcript
    sequence is written to the temp reference FASTA, aligned with blat
    against the prepared target file, chained with simpleChain, and scored
    with evaluate_blat_results. The original notes that chaining avoids
    problems with multiple self alignments in repeats.

    Returns a list with one stringified row
    [gp.name, gene_name, tx_name, best_cov, best_ident] per transcript.
    """
    ref_seqs = Fasta(ref_tx_fasta)
    genome_seqs = Fasta(target_genome_fasta)
    tgt_path, ref_path, psl_path = prepare_tmp_files(tmp_dir, gp, genome_seqs)
    rows = []
    for gene_name, tx_names in tx_dict.iteritems():
        for tx_name in tx_names:
            # The same temp reference and PSL paths are reused for every
            # transcript.
            fastaWrite(ref_path, tx_name, str(ref_seqs[tx_name]))
            blat_cmd = "blat {} {} -out=psl -noHead {}".format(
                tgt_path, ref_path, psl_path)
            system(blat_cmd)
            chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(psl_path)
            # The last element of the split (text after the final newline)
            # is dropped.
            chained_lines = popenCatch(chain_cmd).split("\n")[:-1]
            best_cov, best_ident = evaluate_blat_results(chained_lines)
            fields = [gp.name, gene_name, tx_name, best_cov, best_ident]
            rows.append(map(str, fields))
    return rows