Exemple #1
0
def adjListToRepeatList(newAdjacencyList, folderName, repeatFilename):

    N1 = len(newAdjacencyList)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)

    assert (loadData == repeatList)
def adjListToRepeatList(newAdjacencyList,folderName,repeatFilename):

    N1 = len(newAdjacencyList)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun  ):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
    
Exemple #4
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)

    assert (loadData == repeatList)
def defineRepeatAndFlanking(folderName, mummerLink,contigFilename,contigReadGraph,repeatFilename,repeatSpec ):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "


    
    
    # 0. Load previous data
    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraph(G)
    
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
     
    bigDumpList = []
    
    print "len(repeatList)", len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if ( len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            if  (len(rIn) == 1 and len(rOut) == 1):
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]
            
            # 1. Records reachable indices
            kkIn , kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk)+"_"+"in")
            
            for eachkk in rOut:
                kkOut.append(str(eachkk)+"_"+"out")
                
            
            abunGraphLib.markReachableIndices(G, Grev, kkIn, kkOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = abunGraphLib.markInsideNodes(G, kkIn, kkOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = abunGraphLib.markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = abunGraphLib.markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            #checkPathLength(repeatPathway, G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = abunGraphLib.markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            flankingList, repeatList = abunGraphLib.markAssociatedReads(G, singleMissList, allPassList)
            
            # ## Experimental
            repeatList = allPassList
            
            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
        

     


    # 7. Format return and move on to the phasing 
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
Exemple #6
0
def BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic,
                mummerLink):

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve:
        print "abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve", abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve
        maxRThres = abunHouseKeeper.abunGlobalSplitParameterRobot.RThres

        repeatFinder.adjListToRepeatList(Gnew.adj, folderName,
                                         "phaseRepeatTR.txt")

        json_data = open(folderName + "phaseRepeatTR.txt", 'r')
        repeatPairs = json.load(json_data)

        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []

        G = abunGraphLib.seqGraphWt(0)
        G.loadFromFile(folderName, contigReadGraph)

        Grev = abunGraphLib.formReverseGraphFast(G)

        abunAnalysisList = []

        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            if not abunHouseKeeper.abunGlobalRunEM:
                resolvedList, brResolvedList = [], []

                if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAbunB:
                    if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerB > 0:
                        abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLower = abunHouseKeeper.abunGlobalSplitParameterRobot.AbunLowerB

                    if abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperB > 0:
                        abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpper = abunHouseKeeper.abunGlobalSplitParameterRobot.AbunUpperB

                    if not abunHouseKeeper.abunGlobalSplitParameterRobot.toRunAggB:
                        resolvedList = determindMatch(inList, outList,
                                                      myCountDic, folderName,
                                                      contigReadGraph, N1)
                    else:

                        resolvedList = determindMatchAggregate(
                            inList, outList, myCountDic, folderName,
                            contigReadGraph, N1, Gnew, lenDic)

                if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunBRB:
                    if abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresB > 0:
                        abunHouseKeeper.abunGlobalSplitParameterRobot.BRThres = abunHouseKeeper.abunGlobalSplitParameterRobot.BRThresB

                    brResolvedList = formBRReolve(folderName, inList, outList,
                                                  G, Grev, True, N1)

                combinedList = abunHouseKeeper.getDistinct(resolvedList +
                                                           brResolvedList)

                print "resolvedList, brResolvedList, inList, outList", resolvedList, brResolvedList, inList, outList

                print "resolveConflict(combinedList)", resolveConflict(
                    combinedList)

                abunAnalysisList.append([
                    inList, outList, resolvedList, brResolvedList,
                    resolveConflict(combinedList)
                ])
                if len(inList) <= maxRThres and len(
                        outList) <= maxRThres and len(inList) > 0 and len(
                            outList) > 0:
                    resolvedCombine = resolveConflict(combinedList)
                    Gnew.bipartiteLocalResolve(resolvedCombine, inList,
                                               outList, folderName)
            else:
                import emalgo
                resolvedCombine = emalgo.BResolvePreparation(
                    folderName, inList, outList, G, Grev, N1, mummerLink)
                Gnew.bipartiteLocalResolve(resolvedCombine, inList, outList,
                                           folderName)

        Gnew.condense()

        with open(folderName + "biResolvedCombineList.json", 'w') as f:
            json.dump(biResolvedCombineList, f)

        with open(folderName + "abunAnalysisList.json", 'w') as f:
            json.dump(abunAnalysisList, f)

        #assert(1==2)

        return Gnew

    else:
        return Gnew