Esempio n. 1
0
def FetchPatternWithICIter(allNode,
                           adjMap,
                           minIC,
                           maxIC,
                           seqLis,
                           allPattern,
                           kmer2seqIdSet,
                           overallKmer2Cov,
                           pattern2IC,
                           startNode2rlt=None,
                           ICminInterval=0.05,
                           enableParallel=False,
                           coreNm=8,
                           pattern2kmerSet=None):
    """Recursively collect patterns whose information content lies in [minIC, maxIC].

    For the current IC window a ``Backpack`` model is built and a pattern
    search is started from every node in ``allNode``. The smallest IC among
    the patterns found is then used to split the window into a left part
    ``[minIC, split]`` and a right part ``[split, maxIC]`` (each bound moved
    by at least 0.05), and the function recurses into every part that is
    still wider than ``ICminInterval``.

    Args:
        allNode: iterable of start nodes to try.
        adjMap: mapping from a node to its adjacent (available) nodes.
        minIC: lower bound of the current IC window.
        maxIC: upper bound of the current IC window.
        seqLis: sequences, forwarded to ``Backpack``.
        allPattern: set of patterns collected so far; it is not modified in
            place, the union with the new patterns is returned.
        kmer2seqIdSet: forwarded to ``Backpack``; the model's version is
            returned.
        overallKmer2Cov: forwarded to ``Backpack`` and to the IC computation.
        pattern2IC: mapping from pattern to its IC, forwarded to ``Backpack``.
        startNode2rlt: cache mapping a start node to ``(userCov, pattern)``;
            updated in place and shared with the recursive calls. A fresh
            dict is created when omitted.
        ICminInterval: windows that are not wider than this are not searched.
        enableParallel: when true, start nodes are processed through
            ``Parallel``/``ParallelDiffStartnode`` instead of the serial loop.
        coreNm: number of jobs handed to ``Parallel``.
        pattern2kmerSet: forwarded to ``Backpack``; the model's version is
            returned. A fresh dict is created when omitted.

    Returns:
        Tuple ``(allPattern, kmer2seqIdSet, pattern2IC, pattern2kmerSet)``.
    """
    # Never use a shared mutable default: the cache would otherwise leak
    # between unrelated top-level calls.
    if startNode2rlt is None:
        startNode2rlt = {}
    if pattern2kmerSet is None:
        pattern2kmerSet = {}

    Comm.PrintWithTime('current IC: min: %s, max: %s' % (minIC, maxIC),
                       isLogging=True)

    # build the backpack model
    bp = Backpack(adjMap, minIC, maxIC, seqLis, kmer2seqIdSet, overallKmer2Cov,
                  pattern2IC, pattern2kmerSet)

    # give different start node in case that there are multiple motif
    patternSet = set()
    ICLis = []

    if not enableParallel:
        for startNode in allNode:
            availNodes = adjMap[startNode]

            # Reuse the cached result when its first element falls inside the
            # current window; otherwise run a new search from this node.
            # NOTE(review): the cached tuple is stored as (userCov, pattern)
            # but its first element is compared against IC bounds - confirm
            # that this is the intended quantity.
            if (startNode in startNode2rlt
                    and minIC <= startNode2rlt[startNode][0] <= maxIC):
                userCov, curPattern = startNode2rlt[startNode]
            else:
                userCov, curPattern = bp.searchPattern({startNode}, availNodes,
                                                       set())
            startNode2rlt[startNode] = (userCov, curPattern)
            if not curPattern:
                continue

            # filter 'ACGU' and 'X[ACGU]X'
            # A list (not a lazy ``map``) is required because the counts are
            # inspected twice; a Python 3 ``map`` would be exhausted by the
            # ``set`` call and the second test would never match.
            parsedPatternLis = BioinfoComm.parsePatternStr(curPattern)
            charCntLis = [len(part) for part in parsedPatternLis]
            if set(charCntLis) == {1} or 4 in charCntLis:
                continue

            # calculate new information content
            curIC, bp.pattern2IC = FetchPatternWeighedIC(
                curPattern, bp.pattern2IC, overallKmer2Cov)
            ICLis.append(curIC)
            patternSet.add(curPattern)
    else:
        startNodeInfoLis = Parallel(n_jobs=coreNm)(
            delayed(ParallelDiffStartnode)(startNode, bp, adjMap,
                                           startNode2rlt, minIC, maxIC,
                                           overallKmer2Cov)
            for startNode in allNode)
        for startNode, curIC, curPattern, userCov, curPattern2IC in startNodeInfoLis:
            # an IC of -1 marks a start node that produced nothing usable
            if curIC == -1:
                continue
            pattern2IC = dict(pattern2IC, **curPattern2IC)
            startNode2rlt[startNode] = (userCov, curPattern)
            ICLis.append(curIC)
            patternSet.add(curPattern)

    kmer2seqIdSet = bp.kmer2seqIdSet
    pattern2kmerSet = bp.pattern2kmerSet
    # NOTE(review): in the serial branch the updated IC mapping lives on
    # ``bp.pattern2IC`` while the ``pattern2IC`` argument is what gets
    # returned; whether they are the same object depends on ``Backpack`` and
    # ``FetchPatternWeighedIC`` - confirm.
    if not patternSet:
        # in case there is no result
        return allPattern, kmer2seqIdSet, pattern2IC, pattern2kmerSet
    allPattern = allPattern | patternSet
    nextIC = min(ICLis)

    # the left part
    minIC1 = minIC
    maxIC1 = min(nextIC, maxIC - 0.05)  # at least move the bound for 0.05
    if maxIC1 < maxIC and maxIC1 - minIC1 > ICminInterval:
        allPattern, kmer2seqIdSet, pattern2IC, pattern2kmerSet = FetchPatternWithICIter(
            allNode,
            adjMap,
            minIC1,
            maxIC1,
            seqLis,
            allPattern,
            kmer2seqIdSet,
            overallKmer2Cov,
            pattern2IC,
            startNode2rlt,
            ICminInterval=ICminInterval,  # keep the caller's threshold at every depth
            enableParallel=enableParallel,
            coreNm=coreNm,
            pattern2kmerSet=pattern2kmerSet)

    # the right part
    minIC2 = max(nextIC, minIC + 0.05)  # at least move the bound for 0.05
    maxIC2 = maxIC
    if minIC2 > minIC and maxIC2 - minIC2 > ICminInterval:
        allPattern, kmer2seqIdSet, pattern2IC, pattern2kmerSet = FetchPatternWithICIter(
            allNode,
            adjMap,
            minIC2,
            maxIC2,
            seqLis,
            allPattern,
            kmer2seqIdSet,
            overallKmer2Cov,
            pattern2IC,
            startNode2rlt,
            ICminInterval=ICminInterval,  # keep the caller's threshold at every depth
            enableParallel=enableParallel,
            coreNm=coreNm,
            pattern2kmerSet=pattern2kmerSet)

    INFO('iteration for IC [%s, %s]' % (minIC, maxIC))

    return allPattern, kmer2seqIdSet, pattern2IC, pattern2kmerSet