コード例 #1
0
def formConfirmReadResolve(folderName, inList, outList, G, Grev, N1):
    #print "formConfirmReadResolve"

    resolvedList = []
    confirmingReadList = []
    brLFlankList = []
    brRFlankList = []

    ### Find possible candidate reads
    print "inList , outList formConfirmReadResolve()", inList, outList
    for eachin in inList:
        for eachout in outList:
            pathList = abunGraphLib.findAllPathK(eachin, eachout, G, 3)
            for path in pathList:
                if len(path) == 3 and path[1] >= N1:
                    R = path[1]
                    confirmingReadList.append(R)
                    brLFlankList.append([eachin, R])
                    brRFlankList.append([eachout, R])

    ### Filter simple false cases
    toUseReadDic = {}
    confirmingReadList.sort()
    for key, items in groupby(confirmingReadList):
        toUseReadDic[str(key)] = True

    newbrLFlankList = abunHouseKeeper.getDistinct(brLFlankList)
    newbrLFlankList.sort(key=itemgetter(1))

    for key, items in groupby(newbrLFlankList, itemgetter(1)):
        mylist = list(items)
        if len(mylist) > 1:
            toUseReadDic[str(key)] = False

    newbrRFlankList = abunHouseKeeper.getDistinct(brRFlankList)
    newbrRFlankList.sort(key=itemgetter(1))

    for key, items in groupby(newbrRFlankList, itemgetter(1)):
        mylist = list(items)
        if len(mylist) > 1:
            toUseReadDic[str(key)] = False

    finalSearchReadList = []
    for eachitem in toUseReadDic:
        if toUseReadDic[eachitem] == True:
            finalSearchReadList.append(int(eachitem))

    ### Check paths to confirm all false cases
    for eachR in finalSearchReadList:
        l1 = abunGraphLib.findAllReachable(eachR, N1, G)
        l2 = abunGraphLib.findAllReachable(eachR, N1, Grev)

        l1Distinct = abunHouseKeeper.getDistinct(l1)
        l2Distinct = abunHouseKeeper.getDistinct(l2)

        if len(l1Distinct) == 1 and len(l2Distinct) == 1:
            c1, c2 = l1Distinct[0], l2Distinct[0]
            resolvedList.append([c2, c1])

    return resolvedList
コード例 #2
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    # cut here

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
コード例 #3
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    

    # cut here

    adjListToRepeatList(newAdjacencyList,folderName,repeatFilename )
コード例 #4
0
ファイル: repeatFinder.py プロジェクト: Imoteph/finishingTool
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun  ):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
    
コード例 #5
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)

    assert (loadData == repeatList)
コード例 #6
0
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename):

    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    print "N1", N1
    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        tmpList = abunGraphLib.findAllReachable(i, N1, G)

        for j in tmpList:
            if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                adj[i].append(j)

    ### Filter adaptor skipped case

    adaptorPair = []

    for i in range(len(adj)):
        if i % 2 == 0:
            if i + 1 in adj[i]:
                adj[i].remove(i + 1)
                adaptorPair.append([i, i + 1])
        elif i % 2 == 1:
            if i - 1 in adj[i]:
                adj[i].remove(i - 1)
                adaptorPair.append([i, i - 1])

    Gnew = abunGraphLib.seqGraphDynamic(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1997)

    for eachpair in adaptorPair:
        u, v = eachpair[0], eachpair[1]
        for x in Gnew.graphNodesList[u].listOfPrevNodes:
            xIndex = x[0]
            Gnew.removeEdge(xIndex, v)
        for y in Gnew.graphNodesList[v].listOfNextNodes:
            yIndex = y[0]
            Gnew.removeEdge(u, yIndex)

    ### Trying out the new component
    import toCondenseFixer

    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)

    Gnew.symGraph()
    ### End filter adaptor skipped case

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery:

        Gnew.initAdv()
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive:
            Gnew.transitiveReduction(
                folderName, mummerLink, contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta", G
            )

        Gnew.condense()
        Gnew.findAdjList()
    else:
        Gnew.initAdv()
        Gnew.condense()
        Gnew.findAdjList()

    return Gnew
コード例 #7
0
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    if False:
        json_data = open(folderName + "phaseRepeat.txt", "r")
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList
        ), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
        )
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", "r")
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", "a")
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
        )
コード例 #8
0
def xNodeResolving(folderName, contigReadGraph):

    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    with open(folderName + "myCountDic.json") as f:
        myCountDic = json.load(f)

    N1 = len(myCountDic) * 2

    ### Add resolved edge

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    extraCounter = 0
    mapDummyToRealDic = {}
    resolvedList = []

    for v in Gnew.graphNodesList:

        inList = []
        for eachitem in v.listOfPrevNodes:
            inList.append(eachitem[0])

        outList = []
        for eachitem in v.listOfNextNodes:
            outList.append(eachitem[0])

        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)

        sizeList = []
        for eachitem in myCountDic:
            sizeList.append(myCountDic[eachitem])

        sd = np.std(sizeList)

        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)

            if matchedOut != -1:
                leftCtgIndex, rightCtgIndex = eachIn[0], v.nodeIndex
                inSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1
                )

                leftCtgIndex, rightCtgIndex = v.nodeIndex, matchedOut
                outSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1
                )

                if inSuccReadsList != None and outSuccReadsList != None:

                    resolvedList.append([eachIn[0]] + inSuccReadsList + [N1 + extraCounter])
                    print "in: ", resolvedList[-1]

                    resolvedList.append([N1 + extraCounter] + outSuccReadsList + [matchedOut])
                    print "out: ", resolvedList[-1]

                    mapDummyToRealDic[extraCounter] = v.nodeIndex
                    extraCounter = extraCounter + 1

    return resolvedList, mapDummyToRealDic
コード例 #9
0
def formConfirmReadResolve(folderName, inList, outList, G, Grev, N1):
    # print "formConfirmReadResolve"

    resolvedList = []
    confirmingReadList = []
    brLFlankList = []
    brRFlankList = []

    ### Find possible candidate reads
    print "inList , outList formConfirmReadResolve()", inList, outList
    for eachin in inList:
        for eachout in outList:
            pathList = abunGraphLib.findAllPathK(eachin, eachout, G, 3)

            for path in pathList:

                if len(path) == 3 and path[1] >= N1:
                    R = path[1]
                    confirmingReadList.append(R)
                    brLFlankList.append([eachin, R])
                    brRFlankList.append([eachout, R])

    ### Filter simple false cases
    toUseReadDic = {}
    confirmingReadList.sort()
    for key, items in groupby(confirmingReadList):
        toUseReadDic[str(key)] = True

    newbrLFlankList = abunHouseKeeper.getDistinct(brLFlankList)
    newbrLFlankList.sort(key=itemgetter(1))

    for key, items in groupby(newbrLFlankList, itemgetter(1)):
        mylist = list(items)
        if len(mylist) > 1:
            toUseReadDic[str(key)] = False

    newbrRFlankList = abunHouseKeeper.getDistinct(brRFlankList)
    newbrRFlankList.sort(key=itemgetter(1))

    for key, items in groupby(newbrRFlankList, itemgetter(1)):
        mylist = list(items)
        if len(mylist) > 1:
            toUseReadDic[str(key)] = False

    finalSearchReadList = []
    for eachitem in toUseReadDic:
        if toUseReadDic[eachitem] == True:
            finalSearchReadList.append(int(eachitem))

    ### Check paths to confirm all false cases
    for eachR in finalSearchReadList:
        l1 = abunGraphLib.findAllReachable(eachR, N1, G)
        l2 = abunGraphLib.findAllReachable(eachR, N1, Grev)

        l1Distinct = abunHouseKeeper.getDistinct(l1)
        l2Distinct = abunHouseKeeper.getDistinct(l2)

        if len(l1Distinct) == 1 and len(l2Distinct) == 1:
            c1, c2 = l1Distinct[0], l2Distinct[0]
            resolvedList.append([c2, c1])

    return resolvedList
コード例 #10
0
def continuousIntegration():
	if False:
		G = graphLib.seqGraph(10)
		for i in range(5):
			G.insertEdge(i,i+1,1997)
			G.insertEdge(i,i+2, 1997)

		resultList = abunGraphLib.BFS_revisit(1,3,G,1)

		print "resultList", resultList 

	if False : 

		folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
			"Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"

		abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)
    
		if False:
			lenDic = IORobot.obtainLength(folderName , contigFile)
			N1 = len(lenDic)

			print "N1", N1

			G = graphLib.seqGraph(0)
			G.loadFromFile(folderName, "phaseStringGraph1")

			adj = [[] for i in range(N1)]

			for i in range(N1): 
			    adj[i] = abunGraphLib.findAllReachable(i, N1, G)

			Gnew = abunGraphLib.seqGraphDynamic(N1)

			for i in range(N1):
			    for j in adj[i]:
			        Gnew.insertEdge(i,j,1997)


			Gnew.initAdv()    
			Gnew.doubleEdgeReduction()

			contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
			contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)

			print "contigPaths", contigPaths
			print "contigReadPaths", contigReadPaths

			Gnew.transitiveReduction()

	if False:
		toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
		print toDelete

	if False:
		G = graphLib.seqGraph(0)
		G.loadFromFile("Apr10TestA/", "xResolvedGraph")

		if False:
			for i in range(len(G.graphNodesList)):

				v = G.graphNodesList[i]

				if len(v.nodeIndexList) > 0:
					print i , v.listOfPrevNodes , v.listOfNextNodes

		G.reportEdge()
		lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
		mylist = [401, 207, 405, 407, 344]

		json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
		myCountDic = json.load(json_data)

		for x in mylist:
			print x, lenDic["Contig"+str(x/2)+"_p"], myCountDic["Segkk"+str(x/2)]


	if False:
		folderName = "Apr10TestA/"
		G = graphLib.seqGraph(0)
		G.loadFromFile(folderName , "xResolvedGraph")

		json_data = open(folderName + "mapDummyToRealDic.json", 'r')
		mapDummyToRealDic = json.load(json_data)

		lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
		print len(G.graphNodesList)
		print len(mapDummyToRealDic)
		
		print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)


	if False:
		abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")

	if False: 
		nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")

	if False:
		folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
		G = graphLib.seqGraph(0)
		kthres, edgeThres = 3, 1
		G.loadFromFile(folderName, contigReadGraph)
		lenDic = IORobot.obtainLength(folderName , "improved3_Double.fasta")

		N1 = len(lenDic)

		adj = [[] for i in range(N1)]

		for i in range(N1): 
		    tmpList = abunGraphLib.findAllReachable(i, N1, G)
		    
		    for j in tmpList:
		        if len(abunGraphLib.findAllPathK(i,j,G,kthres)) >= edgeThres:
		            adj[i].append(j) 

		    #print i, adj[i]

	    ### Filter adaptor skipped case 

		adaptorPair = []

		for i in range(len(adj)):
		    if  i % 2 == 0:
		        if i + 1 in adj[i]:
		            adj[i].remove(i+1)
		            adaptorPair.append([i, i+1])
		    elif i % 2 ==1: 
		        if i-1 in adj[i] :
		            adj[i].remove(i-1)
		            adaptorPair.append([i, i-1])

		Gnew = abunGraphLib.seqGraphDynamic(N1)

		for i in range(N1):
		    for j in adj[i]:
		        Gnew.insertEdge(i,j,1997)

		for eachpair in adaptorPair:
		    u, v = eachpair[0], eachpair[1]
		    for x in Gnew.graphNodesList[u].listOfPrevNodes:
		        xIndex = x[0]
		        Gnew.removeEdge(xIndex, v)
		    for y in Gnew.graphNodesList[v].listOfNextNodes:
		        yIndex = y[0]
		        Gnew.removeEdge(u, yIndex)


        #Gnew.reportEdge()
		count2 = 0
		for i in range(len(Gnew.graphNodesList)):
			if  len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and  len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
				count2 = count2 + 1
				print str(i)+"{color:red}"

		print "count2, ", count2

		### End filter adaptor skipped case 
	if True:
		nonRedundantResolver.removeRedundantWithFile("May11TestB/" , "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
コード例 #11
0
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink,
                 readsetFilename, contigFilename):

    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    print "N1", N1
    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        tmpList = abunGraphLib.findAllReachable(i, N1, G)

        for j in tmpList:
            if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                adj[i].append(j)

    ### Filter adaptor skipped case

    adaptorPair = []

    for i in range(len(adj)):
        if i % 2 == 0:
            if i + 1 in adj[i]:
                adj[i].remove(i + 1)
                adaptorPair.append([i, i + 1])
        elif i % 2 == 1:
            if i - 1 in adj[i]:
                adj[i].remove(i - 1)
                adaptorPair.append([i, i - 1])

    Gnew = abunGraphLib.seqGraphDynamic(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1997)

    for eachpair in adaptorPair:
        u, v = eachpair[0], eachpair[1]
        for x in Gnew.graphNodesList[u].listOfPrevNodes:
            xIndex = x[0]
            Gnew.removeEdge(xIndex, v)
        for y in Gnew.graphNodesList[v].listOfNextNodes:
            yIndex = y[0]
            Gnew.removeEdge(u, yIndex)

    Gnew.reportEdge()
    ### Trying out the new component
    import toCondenseFixer
    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)

    Gnew.symGraph()
    #Gnew.reportEdge()
    ### End filter adaptor skipped case

    if abunHouseKeeper.abunGlobalSplitParameterRobot.runGraphSurgery:

        Gnew.initAdv()
        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()

        if abunHouseKeeper.abunGlobalSplitParameterRobot.toRunTransitive:
            Gnew.transitiveReduction(folderName, mummerLink,
                                     contigFilename + "_Double.fasta",
                                     readsetFilename + "_Double.fasta", G)

        Gnew.condense()
        Gnew.findAdjList()
    else:
        Gnew.initAdv()
        Gnew.condense()
        Gnew.findAdjList()

    return Gnew
コード例 #12
0
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph,
                          contigFilename, readsetFilename):
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    if False:
        json_data = open(folderName + "phaseRepeat.txt", 'r')
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic,
                                          folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(
            folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", 'w') as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(mapDummyToRealDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
                eachitem[2], eachitem[3], eachitem[4]
            ]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", 'w') as f:
            json.dump(gapContentLookUpDic, f)

    if False:
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", 'r')
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " +
                  folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(
            folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", 'a')
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                      "tmpWithDummy.fasta",
                                      gapContentLookUpDic, mapDummyToRealDic)
コード例 #13
0
def xNodeResolving(folderName, contigReadGraph):
    '''
    Input : contigGraph , abunInfo , folderName  

    Output: myresolvedList.json, gapContentLookUp.json, dummyNodeMapping.json

    Algorithm :
        1) Tranverse the graph 
            a) If the node can well be fixed with sd requirement met 
                i) Link it across and add the pair into the myresolvedList, gapContentLookUp
                ii) Add dummynodes and fill in the dummyNodeMapping 
        
        2) Format return and output as temp file 
    '''

    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    with open(folderName + 'myCountDic.json') as f:
        myCountDic = json.load(f)

    N1 = len(myCountDic) * 2

    ### Add resolved edge

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    extraCounter = 0
    mapDummyToRealDic = {}
    resolvedList = []

    for v in Gnew.graphNodesList:

        inList = []
        for eachitem in v.listOfPrevNodes:
            inList.append(eachitem[0])

        outList = []
        for eachitem in v.listOfNextNodes:
            outList.append(eachitem[0])

        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)

        sizeList = []
        for eachitem in myCountDic:
            sizeList.append(myCountDic[eachitem])

        sd = np.std(sizeList)

        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)

            if matchedOut != -1:
                leftCtgIndex, rightCtgIndex = eachIn[0], v.nodeIndex
                inSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph,
                    N1)

                leftCtgIndex, rightCtgIndex = v.nodeIndex, matchedOut
                outSuccReadsList = abunGraphLib.findPathBtwEnds(
                    folderName, leftCtgIndex, rightCtgIndex, contigReadGraph,
                    N1)

                if inSuccReadsList != None and outSuccReadsList != None:

                    resolvedList.append([eachIn[0]] + inSuccReadsList +
                                        [N1 + extraCounter])
                    print "in: ", resolvedList[-1]

                    resolvedList.append([N1 + extraCounter] +
                                        outSuccReadsList + [matchedOut])
                    print "out: ", resolvedList[-1]

                    mapDummyToRealDic[extraCounter] = v.nodeIndex
                    extraCounter = extraCounter + 1

    return resolvedList, mapDummyToRealDic