Esempio n. 1
0
def getSteps(halPath, maxBlock, scaleFactor, minLod0, cutOffFrac, minSeqFrac,
             minCovFrac):
    """Compute the sequence of level-of-detail step sizes for a HAL file.

    The first step is the larger of ceil(minLod0 / maxBlock) and the value
    returned by getMinAvgBlockSize(); every following step is the previous
    one multiplied by scaleFactor.

    Generation stops after appending a step when either:
      * the step exceeds ceil(maxGenomeLength / maxBlock) * cutOffFrac, or
      * both minSeqFrac and minCovFrac are positive and
        getMinCoverageFrac(..., floor(step * minSeqFrac)) falls below
        minCovFrac; in this case the returned flag is True.

    Returns a tuple (steps, lastIsMax) where steps is a list of ints.

    NOTE(review): the loop only ends through one of the two conditions
    above, so scaleFactor is presumably expected to be > 1 -- confirm
    against callers.
    """
    statsTable = getHalStats(halPath)
    # per-genome sequence statistics, keyed by the first column of each row
    sequenceStatsTable = {row[0]: getHalSequenceStats(halPath, row[0])
                          for row in statsTable}
    maxLen = getMaxGenomeLength(statsTable)
    assert maxLen > 0
    maxStep = math.ceil(float(maxLen) / float(maxBlock))
    lodBaseStep = math.ceil(float(minLod0) / float(maxBlock))
    baseStep = max(lodBaseStep, getMinAvgBlockSize(statsTable))
    # the coverage test is only active when both fractions are positive
    checkCoverage = minSeqFrac > 0. and minCovFrac > 0.
    outList = []
    step = baseStep
    # last LOD is just "max" token which tells browser it and anything
    # beyond is disabled.
    lastIsMax = False
    while True:
        outList.append(step)
        if step > maxStep * cutOffFrac:
            break
        if checkCoverage:
            minCoverageFrac = getMinCoverageFrac(sequenceStatsTable,
                                                 math.floor(step * minSeqFrac))
            if minCoverageFrac < minCovFrac:
                lastIsMax = True
                break
        step *= scaleFactor
    return [int(x) for x in outList], lastIsMax
Esempio n. 2
0
def partitionByGenomeCoords(options, refGenome):
    """Build hal2maf commands by slicing the genome coordinates directly.

    Works on a deep copy of options, so the caller's object is not modified.
    When no slice size is set and more than one process is requested, the
    slice size is derived from options.length if it is positive, otherwise
    from the median of the second field of the sequence statistics rows,
    divided by the number of processes and rounded up.

    Returns a tuple (sliceCmds, sliceOpts): the command built by
    getHal2MafCmd() for every slice, and a snapshot of the options used to
    build each command.
    """
    sliceCmds = []
    sliceOpts = []
    seqOpts = copy.deepcopy(options)
    assert seqOpts.splitBySequence is False
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        else:
            # use median of sequence lengths
            refLen = int(statistics.median([r[1] for r in refSequenceStats]))
        # a single ceil is enough; the result is already integral
        seqOpts.sliceSize = math.ceil(refLen / seqOpts.numProc)

    # NOTE(review): seqOpts is shared by all sequences, so the start/length
    # of the previous sequence's last slice are still set when
    # computeSlices() is called for the next one. computeSlices() is not
    # visible here -- confirm that this carry-over is intended.
    for refSeqStat in refSequenceStats:
        seqOpts.refSequence = refSeqStat[0]
        for sStart, sLen, sIdx in computeSlices(seqOpts, refSeqStat[1]):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            # snapshot, because seqOpts keeps being mutated
            sliceOpts.append(copy.deepcopy(seqOpts))
    return sliceCmds, sliceOpts
Esempio n. 3
0
def writeChromSizes(options):
    """Write a chromosome sizes file for the reference genome.

    Does nothing when options.chromSizes is None. Otherwise the file named
    by options.chromSizes is (re)created and one tab-separated line is
    written per row returned by getHalSequenceStats(), holding the first
    two fields of the row (presumably sequence name and length).
    """
    if options.chromSizes is None:
        return
    # the context manager closes the file even if the stats lookup or the
    # assertion below fails
    with open(options.chromSizes, "w") as csFile:
        refSequenceStats = getHalSequenceStats(options.halFile,
                                               options.refGenome)
        assert refSequenceStats is not None
        for seqStat in refSequenceStats:
            csFile.write("%s\t%s\n" % (seqStat[0], seqStat[1]))
Esempio n. 4
0
def writeChromSizes(options):
    """Write a chromosome sizes file for the reference genome.

    Does nothing when options.chromSizes is None. Otherwise the file named
    by options.chromSizes is (re)created and one tab-separated line is
    written per row returned by getHalSequenceStats(), holding the first
    two fields of the row (presumably sequence name and length).
    """
    if options.chromSizes is None:
        return
    # the context manager closes the file even if the stats lookup or the
    # assertion below fails
    with open(options.chromSizes, "w") as csFile:
        refSequenceStats = getHalSequenceStats(options.halFile,
                                               options.refGenome)
        assert refSequenceStats is not None
        for seqStat in refSequenceStats:
            csFile.write("%s\t%s\n" % (seqStat[0], seqStat[1]))
Esempio n. 5
0
def runParallelSlices(options):
    """Split a hal2maf job into slices, run them in parallel and merge them.

    The reference genome is options.refGenome, or the value returned by
    getHalRootName() when that is None. Two partitioning modes exist:

      * sequence coordinates, used when options.splitBySequence is True or
        options.refSequence is set: every matching sequence gets its own
        deep copy of the options and is sliced independently;
      * genome coordinates otherwise: the whole genome length is sliced.

    options.smallFile and options.firstSmallFile are reset on the caller's
    object; firstSmallFile is cleared once a non-empty sequence shorter than
    options.smallSize has been processed.

    The generated commands are run with runParallelShellCommands() using
    options.numProc, then handed to concatenateSlices().
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        # the last two fields of each row are not needed here
        for sequence, seqLen, _nt, _nb in refSequenceStats:
            if options.refSequence is None or sequence == options.refSequence:
                seqOpts = copy.deepcopy(options)
                if seqLen < options.smallSize:
                    seqOpts.smallFile = True
                seqOpts.refGenome = refGenome
                seqOpts.refSequence = sequence
                for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                    seqOpts.start = sStart
                    seqOpts.length = sLen
                    seqOpts.sliceNumber = sIdx
                    sliceCmds.append(getHal2MafCmd(seqOpts))
                    # snapshot, because seqOpts keeps being mutated
                    sliceOpts.append(copy.deepcopy(seqOpts))
                # later copies of options see that a small file already exists
                if seqOpts.smallFile is True and seqLen > 0:
                    options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
Esempio n. 6
0
def runParallelSlices(options):
    """Split a hal2maf job into slices, run them in parallel and merge them.

    The reference genome is options.refGenome, or the value returned by
    getHalRootName() when that is None. Two partitioning modes exist:

      * sequence coordinates, used when options.splitBySequence is True or
        options.refSequence is set: every matching sequence gets its own
        deep copy of the options and is sliced independently;
      * genome coordinates otherwise: the whole genome length is sliced.

    options.smallFile and options.firstSmallFile are reset on the caller's
    object; firstSmallFile is cleared once a non-empty sequence shorter than
    options.smallSize has been processed.

    The generated commands are run with runParallelShellCommands() using
    options.numProc, then handed to concatenateSlices().
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    # we are going to deal with sequence coordinates
    if options.splitBySequence is True or options.refSequence is not None:
        # the last two fields of each row are not needed here
        for sequence, seqLen, _nt, _nb in refSequenceStats:
            if options.refSequence is None or sequence == options.refSequence:
                seqOpts = copy.deepcopy(options)
                if seqLen < options.smallSize:
                    seqOpts.smallFile = True
                seqOpts.refGenome = refGenome
                seqOpts.refSequence = sequence
                for sStart, sLen, sIdx in computeSlices(seqOpts, seqLen):
                    seqOpts.start = sStart
                    seqOpts.length = sLen
                    seqOpts.sliceNumber = sIdx
                    sliceCmds.append(getHal2MafCmd(seqOpts))
                    # snapshot, because seqOpts keeps being mutated
                    sliceOpts.append(copy.deepcopy(seqOpts))
                # later copies of options see that a small file already exists
                if seqOpts.smallFile is True and seqLen > 0:
                    options.firstSmallFile = False
    # we are slicing the genome coordinates directly
    else:
        seqOpts = copy.deepcopy(options)
        assert seqOpts.splitBySequence is False
        genomeLen = getHalGenomeLength(seqOpts.halFile, refGenome)
        # auto compute slice size from numprocs
        if seqOpts.sliceSize is None and seqOpts.numProc > 1:
            refLen = genomeLen
            if seqOpts.length is not None and seqOpts.length > 0:
                refLen = seqOpts.length
            seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

        for sStart, sLen, sIdx in computeSlices(seqOpts, genomeLen):
            seqOpts.start = sStart
            seqOpts.length = sLen
            seqOpts.sliceNumber = sIdx
            sliceCmds.append(getHal2MafCmd(seqOpts))
            sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)
Esempio n. 7
0
def runParallelSlices(options):
    """Split a halPhyloP job into slices, run them in parallel and merge them.

    The reference genome is options.refGenome, or the value returned by
    getHalRootName() when that is None. The range to slice is the length of
    options.refSequence when one is given, otherwise the whole genome length
    from getHalGenomeLength().

    options.smallFile and options.firstSmallFile are reset on the caller's
    object. The generated commands are run with runParallelShellCommands()
    using options.numProc, handed to concatenateSlices(), and finally
    writeChromSizes(options) is called.

    Raises RuntimeError when options.refSequence does not match exactly one
    row of the reference genome's sequence statistics.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    if options.refSequence is not None:
        # each row starts with the sequence name followed by its length
        refStat = [x for x in refSequenceStats
                   if x[0] == options.refSequence]
        if len(refStat) != 1:
            # report the resolved genome name; options.refGenome may be None
            raise RuntimeError("Sequence %s not found in genome %s" % (
                options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # snapshot, because seqOpts keeps being mutated
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)

    writeChromSizes(options)
Esempio n. 8
0
def runParallelSlices(options):
    """Split a halPhyloP job into slices, run them in parallel and merge them.

    The reference genome is options.refGenome, or the value returned by
    getHalRootName() when that is None. The range to slice is the length of
    options.refSequence when one is given, otherwise the whole genome length
    from getHalGenomeLength().

    options.smallFile and options.firstSmallFile are reset on the caller's
    object. The generated commands are run with runParallelShellCommands()
    using options.numProc, handed to concatenateSlices(), and finally
    writeChromSizes(options) is called.

    Raises RuntimeError when options.refSequence does not match exactly one
    row of the reference genome's sequence statistics.
    """
    refGenome = options.refGenome
    if refGenome is None:
        refGenome = getHalRootName(options.halFile)
    refSequenceStats = getHalSequenceStats(options.halFile, refGenome)
    options.smallFile = False
    options.firstSmallFile = True
    sliceCmds = []
    sliceOpts = []
    if options.refSequence is not None:
        # each row starts with the sequence name followed by its length
        refStat = [x for x in refSequenceStats if x[0] == options.refSequence]
        if len(refStat) != 1:
            # report the resolved genome name; options.refGenome may be None
            raise RuntimeError("Sequence %s not found in genome %s" %
                               (options.refSequence, refGenome))
        totalLength = int(refStat[0][1])
    else:
        totalLength = getHalGenomeLength(options.halFile, refGenome)

    seqOpts = copy.deepcopy(options)

    # auto compute slice size from numprocs
    if seqOpts.sliceSize is None and seqOpts.numProc > 1:
        refLen = totalLength
        if seqOpts.length is not None and seqOpts.length > 0:
            refLen = seqOpts.length
        seqOpts.sliceSize = int(math.ceil(refLen / seqOpts.numProc))

    for sStart, sLen, sIdx in computeSlices(seqOpts, totalLength):
        seqOpts.start = sStart
        seqOpts.length = sLen
        seqOpts.sliceNumber = sIdx
        sliceCmds.append(getHalPhyloPCmd(seqOpts))
        # snapshot, because seqOpts keeps being mutated
        sliceOpts.append(copy.deepcopy(seqOpts))

    # run in parallel
    runParallelShellCommands(sliceCmds, options.numProc)

    # concatenate into output if desired
    concatenateSlices(sliceOpts, sliceCmds)

    writeChromSizes(options)
Esempio n. 9
0
def partitionBySeqCoords(options, refGenome):
    """Build hal2maf commands slice by slice, in sequence coordinates.

    Every sequence of refGenome (or only options.refSequence when it is set)
    gets its own deep copy of options, which is flagged as a small file when
    the sequence is shorter than options.smallSize. After a non-empty small
    sequence has been processed, options.firstSmallFile is cleared on the
    caller's object.

    Returns a tuple (commands, optionSnapshots), one entry per slice.
    """
    cmds = []
    snapshots = []
    wanted = options.refSequence
    for name, size, _nt, _nb in getHalSequenceStats(options.halFile,
                                                    refGenome):
        # skip sequences other than the requested one, if any
        if wanted is not None and name != wanted:
            continue
        perSeq = copy.deepcopy(options)
        if size < options.smallSize:
            perSeq.smallFile = True
        perSeq.refGenome = refGenome
        perSeq.refSequence = name
        for begin, span, number in computeSlices(perSeq, size):
            perSeq.start = begin
            perSeq.length = span
            perSeq.sliceNumber = number
            cmds.append(getHal2MafCmd(perSeq))
            # keep a frozen copy, since perSeq is reused for the next slice
            snapshots.append(copy.deepcopy(perSeq))
        if perSeq.smallFile is True and size > 0:
            options.firstSmallFile = False
    return cmds, snapshots