Example #1
0
def computeTreePhyloP(args):
    """Run halPhyloPMP.py on every genome reachable from args.root.

    Genomes are visited with an explicit stack seeded with args.root, so a
    parent is always processed before any of its children.  For each
    genome the function:

    1. for non-root genomes, writes a bed file with halAlignedExtract
       --complement and passes it to halPhyloPMP.py through --refBed;
    2. runs halPhyloPMP.py into the genome's wig file, then removes the
       bed file;
    3. for non-root genomes, appends to that wig file with
       halWiggleLiftover from the parent's wig file when the latter exists;
    4. when args.bigWig is True and the wig file exists, queues a
       wig -> bigWig conversion command.

    The queued conversions are executed together at the end with
    runParallelShellCommands.

    args is expected to provide: root, hal, mod, numProc, subtree, prec and
    bigWig (plus whatever outFileName reads from it).
    """
    pending = [args.root]
    bigwigJobs = []
    while pending:
        genome = pending.pop()
        hasParent = genome != args.root

        # Bed file of the regions of genome that do not align to its
        # parent; the root has no parent, so nothing is extracted for it.
        insertsBed = outFileName(args, genome, "bed", "inserts", True)
        if hasParent:
            extractCmd = (
                "halAlignedExtract %s %s --alignedFile %s --complement" % (
                    args.hal, genome, insertsBed))
            runShellCommand(extractCmd)
            refBedFlag = "--refBed %s" % insertsBed
        else:
            refBedFlag = ""

        # halPhyloP restricted to the inserts (whole genome for the root).
        wigPath = outFileName(args, genome, "wig", "phyloP", False)
        phyloPParts = ["halPhyloPMP.py %s %s %s %s --numProc %d %s" % (
            args.hal, genome, args.mod, refBedFlag, args.numProc, wigPath)]
        if args.subtree is not None:
            phyloPParts.append(" --subtree %s" % args.subtree)
        if args.prec is not None:
            phyloPParts.append(" --prec %d" % args.prec)
        runShellCommand("".join(phyloPParts))

        # The bed file is only needed by the command above.
        runShellCommand("rm -f %s" % insertsBed)

        # Lift the parent's scores down, appending to the wig made above.
        if hasParent:
            parentName = getHalParentName(args.hal, genome)
            parentWigPath = outFileName(args, parentName, "wig", "phyloP",
                                        False)
            if os.path.isfile(parentWigPath):
                liftArgs = (args.hal, parentName, parentWigPath, genome,
                            wigPath)
                runShellCommand(
                    "halWiggleLiftover %s %s %s %s %s --append" % liftArgs)

        # Queue (do not run yet) the bigWig conversion; the queued command
        # also deletes the wig and the chrom sizes file once it succeeds.
        if args.bigWig is True and os.path.isfile(wigPath):
            chromSizes = outFileName(args, genome, "sizes", "chr", True)
            bigWigPath = outFileName(args, genome, "bw", "phyloP", False)
            bigwigJobs.append("".join([
                "halStats %s --chromSizes %s > %s && " % (
                    args.hal, genome, chromSizes),
                "wigToBigWig %s %s %s && " % (
                    wigPath, chromSizes, bigWigPath),
                "rm -f %s &&" % wigPath,
                "rm -f %s" % chromSizes,
            ]))

        # Children go on the stack to be visited after this genome.
        pending.extend(getHalChildrenNames(args.hal, genome))

    # Parallel bigWig conversion of everything queued above.
    runParallelShellCommands(bigwigJobs, args.numProc)
Example #2
0
def runParallelSlices(options):
    """Build one hal2maf command per slice, run them all, then merge.

    The reference genome is options.refGenome, or the HAL root (via
    getHalRootName) when that is None.  Two slicing modes exist:

    * sequence coordinates, when options.splitBySequence is True or
      options.refSequence is set: every selected reference sequence is
      sliced on its own with computeSlices;
    * genome coordinates otherwise: the whole genome length is sliced, and
      when no sliceSize was given and numProc > 1 the slice size is derived
      from the length to cover and numProc.

    Side effects: sets options.smallFile and options.firstSmallFile, runs
    the commands through runParallelShellCommands and finally calls
    concatenateSlices with the per-slice option snapshots and commands.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        for sequence, seqLen, _nt, _nb in refSequenceStats:
            if (options.refSequence is not None and
                    sequence != options.refSequence):
                continue
            seqOpts = copy.deepcopy(options)
            if seqLen < options.smallSize:
                seqOpts.smallFile = True
            seqOpts.refGenome = refGenome
            seqOpts.refSequence = sequence
            for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                seqOpts.start = sStart
                seqOpts.length = sLen
                seqOpts.sliceNumber = sIdx
                sliceCmds.append(getHal2MafCmd(seqOpts))
                # snapshot: seqOpts is overwritten by the next slice
                sliceOpts.append(copy.deepcopy(seqOpts))
            # Copies of options made for later sequences will see
            # firstSmallFile == False once a non-empty small sequence
            # has been handled.
            if seqOpts.smallFile is True and seqLen > 0:
                options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            # Ceiling division done with floor division on the negated
            # value: "math.ceil(a / b)" silently floors under Python 2
            # integer division, which yields more slices than numProc.
            seqOpts.sliceSize = int(-(-refLen // seqOpts.numProc))

        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            # snapshot: seqOpts is overwritten by the next slice
            sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
Example #3
0
def runParallelSlices(options):
    """Slice a hal2maf export, run the slices in parallel and merge them.

    The reference genome defaults to the HAL root (getHalRootName) when
    options.refGenome is None.  When options.splitBySequence is True or
    options.refSequence is set, each selected reference sequence is sliced
    separately; otherwise the genome length as a whole is sliced, with the
    slice size auto-derived from numProc if options.sliceSize is None.

    Side effects: options.smallFile / options.firstSmallFile are set on the
    caller's object, every slice command is executed through
    runParallelShellCommands, and concatenateSlices is called last with
    the per-slice option snapshots and their commands.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []

    def queueSlices(opts, totalLen):
        # Record one command and one options snapshot per slice.  opts is
        # updated in place before each snapshot, exactly in this order,
        # because computeSlices is handed the same object.
        for sStart, sLen, sIdx in computeSlices(opts, totalLen):
            opts.start = sStart
            opts.length = sLen
            opts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(opts))
            sliceOpts.append(copy.deepcopy(opts))

    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        for sequence, seqLen, _nt, _nb in refSequenceStats:
            if options.refSequence is None or sequence == options.refSequence:
                seqOpts = copy.deepcopy(options)
                if seqLen < options.smallSize:
                    seqOpts.smallFile = True
                seqOpts.refGenome = refGenome
                seqOpts.refSequence = sequence
                queueSlices(seqOpts, seqLen)
                # Later deep copies of options inherit this flag, so only
                # the first non-empty small sequence sees it as True.
                if seqOpts.smallFile is True and seqLen > 0:
                    options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            # Integer ceiling division.  The former
            # "math.ceil(refLen / numProc)" floors first when "/" is
            # Python 2 integer division, giving a slice size that is too
            # small and therefore more slices than processes.
            seqOpts.sliceSize = int(-(-refLen // seqOpts.numProc))

        queueSlices(seqOpts, genomeLen)

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
Example #4
0
def runParallelSlices(options):
    """Slice a halPhyloP run, execute the slices in parallel and merge.

    The reference genome is options.refGenome, or the HAL root (via
    getHalRootName) when that is None.  The length to slice is the length
    of options.refSequence when one is given, otherwise the whole genome
    length.  If options.sliceSize is None and numProc > 1, the slice size
    is derived from that length (or from options.length when positive) and
    numProc.

    Side effects: sets options.smallFile and options.firstSmallFile, runs
    every slice command through runParallelShellCommands, then calls
    concatenateSlices and writeChromSizes.

    Raises:
        RuntimeError: options.refSequence does not match exactly one
            sequence of the reference genome.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    if options.refSequence is not None:
        # Rows are assumed to be (name, length, ...) tuples, matching how
        # other callers unpack getHalSequenceStats -- verify if that helper
        # changes.  The name is field 0; the old code compared the length
        # field with the name and so could never match.
        refStat = [x for x in refSequenceStats
                   if x[0] == options.refSequence]
        # was "len(refStat != 1)", which raises TypeError (len of a bool)
        if len(refStat) != 1:
            # report the resolved genome; options.refGenome may be None
            raise RuntimeError("Sequence %s not found in genome %s" % (
                options.refSequence, refGenome))
        # length of the single matching row (was refStat[1], i.e. a
        # second match that cannot exist)
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        # Integer ceiling division: "math.ceil(a / b)" floors first under
        # Python 2 integer division and would produce too many slices.
        seqOpts.sliceSize = int(-(-refLen // seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # snapshot: seqOpts is overwritten by the next slice
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)

    writeChromSizes(options)
Example #5
0
def createLods(halPath, outLodPath, outDir, maxBlock, scale, overwrite,
               maxDNA, absPath, trans, inMemory, probeFrac, minSeqFrac,
               scaleCorFac, numProc, chunk, minLod0, cutOff, minCovFrac):
    """Write the LOD index file and generate the level-of-detail HAL files.

    The index written to outLodPath has one "<maxQueryLength> <path>" line
    per level: first "0" with the original HAL file, then one line per step
    returned by getSteps (after the first).  The halLodExtract commands for
    the levels are collected while the index is written and executed
    together afterwards with runParallelShellCommands.

    Parameters used directly here:
        halPath: input HAL file; also the source of every extraction unless
            trans is True.
        outLodPath: index file to (over)write.
        outDir: passed to makePath to place the generated HAL files.
        maxBlock: multiplied by the previous step to get the max query
            length written for a level.
        scale: raised to the level index to get the scale handed to
            halLodExtract.
        overwrite: when True, regenerate levels whose file already exists.
        maxDNA: a level's keepSequences flag is True only while its max
            query length is <= maxDNA.
        absPath: passed to formatOutHalPath for the paths in the index.
        trans: when True, levels after the first are extracted from the
            previous level's file rather than from halPath.
        scaleCorFac: correction factor compounded once per level.
        numProc: passed to runParallelShellCommands.
    minLod0, cutOff, minSeqFrac and minCovFrac go to getSteps; inMemory,
    probeFrac, minSeqFrac, chunk and minCovFrac go to getHalLodExtractCmd.
    """
    lodExtractCmds = []
    # "with" guarantees the index is closed even if a helper below raises.
    with open(outLodPath, "w") as lodFile:
        lodFile.write("0 %s\n" % formatOutHalPath(outLodPath, halPath,
                                                  absPath))
        steps, lastIsMax = getSteps(halPath, maxBlock, scale, minLod0,
                                    cutOff, minSeqFrac, minCovFrac)
        curStepFactor = scaleCorFac
        prevStep = None
        # range (not xrange) so the loop runs on both Python 2 and 3
        for stepIdx in range(1, len(steps)):
            step = int(max(1, steps[stepIdx] * curStepFactor))
            maxQueryLength = maxBlock * steps[stepIdx - 1]
            keepSequences = maxQueryLength <= maxDNA
            # we no longer pass the step to the halLodExtract executable,
            # rather we give the corresponding scale factor and let the
            # step get computed for each internal node (instead of using
            # the step here, which is a global minimum)
            stepScale = (scale ** stepIdx) * curStepFactor
            outHalPath = makePath(halPath, outDir, step, "lod", "hal")
            srcPath = halPath
            if trans is True and stepIdx > 1:
                # prevStep is always set here: stepIdx > 1 means at least
                # one earlier iteration has completed
                srcPath = makePath(halPath, outDir, prevStep, "lod", "hal")
            isMaxLod = stepIdx == len(steps) - 1 and lastIsMax is True
            if not isMaxLod and (overwrite is True or
                                 not os.path.isfile(outHalPath)):
                lodExtractCmds.append(
                    getHalLodExtractCmd(srcPath, outHalPath, stepScale,
                                        keepSequences, inMemory, probeFrac,
                                        minSeqFrac, chunk, minCovFrac))
            lodPath = formatOutHalPath(outLodPath, outHalPath, absPath)
            if isMaxLod:
                # the last level is a token in the index, not a file
                lodPath = MaxLodToken

            lodFile.write("%d %s\n" % (maxQueryLength, lodPath))

            # None on the first pass: comparing None with an int is False
            # on Python 2 but a TypeError on Python 3, hence the guard.
            if prevStep is not None and prevStep > steps[-1]:
                break
            prevStep = step
            curStepFactor *= scaleCorFac
    runParallelShellCommands(lodExtractCmds, numProc)
Example #6
0
def runParallelSlices(options):
    """Partition a halPhyloP job into slices, run them, and merge results.

    The reference genome defaults to the HAL root (getHalRootName) when
    options.refGenome is None.  The total length to partition is that of
    options.refSequence if it is set, else the reference genome length.
    When options.sliceSize is None and numProc > 1 the slice size is
    computed from the length to cover (options.length when positive,
    otherwise the total length) divided by numProc, rounded up.

    Side effects: options.smallFile and options.firstSmallFile are set;
    slice commands are run with runParallelShellCommands; concatenateSlices
    and writeChromSizes are called afterwards.

    Raises:
        RuntimeError: options.refSequence is not found exactly once among
            the reference genome's sequences.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    if options.refSequence is not None:
        # Stats rows are assumed to begin with (name, length) -- confirm
        # against getHalSequenceStats.  Match on the name (field 0); the
        # previous code matched on field 1, the length.
        matches = [stat for stat in refSequenceStats
                   if stat[0] == options.refSequence]
        # previously "len(refStat != 1)": len() of a bool is a TypeError
        if len(matches) != 1:
            # use the resolved genome name; options.refGenome can be None
            raise RuntimeError("Sequence %s not found in genome %s" %
                               (options.refSequence, refGenome))
        # length field of the only match (previously indexed the list
        # itself at 1, which is out of range)
        totalLength = int(matches[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        # Ceiling division in integer arithmetic; math.ceil over "/" is a
        # no-op when "/" is Python 2 integer division.
        seqOpts.sliceSize = int(-(-refLen // seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # keep a snapshot: seqOpts is reused for the next slice
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)

    writeChromSizes(options)
0
def runParallelSlices(options):
    """Partition the job, run all slice commands, clean up and merge.

    The partitioning helper is chosen in this order:

    1. partitionRefTargets when options.refTargets is truthy;
    2. partitionBySeqCoords when options.splitBySequence is True or
       options.refSequence is set;
    3. partitionByGenomeCoords otherwise.

    Each helper returns a (commands, per-slice options) pair.  The commands
    are run with runParallelShellCommands, any file named by a slice's
    refTargets attribute is deleted, and concatenateSlices is called last.
    options.smallFile and options.firstSmallFile are reset on the caller's
    object before partitioning.
    """
    requestedGenome = options.refGenome
    genomeName = (getHalRootName(options.halFile)
                  if requestedGenome is None else requestedGenome)

    options.smallFile = False
    options.firstSmallFile = True

    # Conditions are evaluated lazily, top to bottom, as listed above.
    if options.refTargets:
        partition = partitionRefTargets(options)
    elif options.splitBySequence is True or options.refSequence is not None:
        partition = partitionBySeqCoords(options, genomeName)
    else:
        partition = partitionByGenomeCoords(options, genomeName)
    commands, perSliceOpts = partition

    # run in parallel
    runParallelShellCommands(commands, options.numProc)

    # clean up temporary bed files (if present)
    for sliceOpt in perSliceOpts:
        bedPath = sliceOpt.refTargets
        if not bedPath:
            continue
        if os.path.isfile(bedPath):
            os.remove(bedPath)

    # concatenate into output if desired
    concatenateSlices(perSliceOpts, commands)