def formConfirmReadResolve(folderName, inList, outList, G, Grev, N1):
    """Resolve in/out node pairs through reads that bridge them unambiguously.

    Candidate reads are the middle nodes (index >= N1) of the three-node
    paths that abunGraphLib.findAllPathK reports from a node of inList to a
    node of outList.  A candidate is discarded when it is paired with more
    than one distinct in-node or more than one distinct out-node.  For each
    remaining read, abunGraphLib.findAllReachable is queried on G and on
    Grev; when both queries yield exactly one distinct node, the pair
    [node from Grev, node from G] is recorded.

    folderName is accepted but not used in this function.

    Returns:
        A list of two-element lists, one per confirmed pair.
    """
    # Single parenthesised argument: prints the same text under Python 2 and 3.
    print("%s %s %s" % ("inList , outList formConfirmReadResolve()", inList, outList))

    ### Find possible candidate reads
    candidateReads = []
    leftFlanks = []
    rightFlanks = []
    for src in inList:
        for dst in outList:
            for route in abunGraphLib.findAllPathK(src, dst, G, 3):
                if len(route) != 3 or route[1] < N1:
                    continue
                read = route[1]
                candidateReads.append(read)
                leftFlanks.append([src, read])
                rightFlanks.append([dst, read])

    ### Filter simple false cases
    # Keys are the stringified read indices; every candidate starts as usable.
    usable = {}
    candidateReads.sort()
    for read, _ in groupby(candidateReads):
        usable[str(read)] = True

    def rejectAmbiguous(flanks):
        # A read seen with more than one distinct flank node is ruled out.
        distinct = abunHouseKeeper.getDistinct(flanks)
        distinct.sort(key=itemgetter(1))
        for read, group in groupby(distinct, itemgetter(1)):
            if len(list(group)) > 1:
                usable[str(read)] = False

    rejectAmbiguous(leftFlanks)
    rejectAmbiguous(rightFlanks)

    searchReads = [int(name) for name in usable if usable[name]]

    ### Check paths to confirm all false cases
    resolvedList = []
    for read in searchReads:
        forward = abunGraphLib.findAllReachable(read, N1, G)
        backward = abunGraphLib.findAllReachable(read, N1, Grev)
        forwardDistinct = abunHouseKeeper.getDistinct(forward)
        backwardDistinct = abunHouseKeeper.getDistinct(backward)
        if len(forwardDistinct) == 1 and len(backwardDistinct) == 1:
            resolvedList.append([backwardDistinct[0], forwardDistinct[0]])

    return resolvedList
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] }
    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    Parameters:
        folderName      -- path prefix handed to the graph loader, the length
                           reader and adjListToRepeatList
        mummerLink      -- accepted but not used in this function
        contigFilename  -- base name; "_Double.fasta" is appended to it
        contigReadGraph -- name of the stored graph to load
        repeatFilename  -- forwarded to adjListToRepeatList
        optionToRun     -- "tandem" (use the reachability lists as they are)
                           or "xphase" (pass them through abunGraphLib.filterEdge)

    Raises:
        ValueError -- optionToRun is neither "tandem" nor "xphase"
    '''
    # Validate the mode before doing any work.  An unknown mode would
    # otherwise leave newAdjacencyList unbound and fail with an
    # UnboundLocalError only after the whole reachability scan.
    if optionToRun not in ("tandem", "xphase"):
        raise ValueError('optionToRun must be "tandem" or "xphase", got %r' % (optionToRun,))

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = []
    for i in range(N1):
        reachable = abunGraphLib.findAllReachable(i, N1, G)
        adjacencyList.append(reachable)
        # Single parenthesised argument: same output under Python 2 and 3.
        print("%s %s %s" % ("i, adjacencyList[i] : ", i, reachable))

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    else:  # "xphase"
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] }
    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    optionToRun selects how the reachability lists are used: "tandem" keeps
    them unchanged, "xphase" filters them with abunGraphLib.filterEdge.
    The resulting lists are passed to adjListToRepeatList together with
    folderName and repeatFilename.  mummerLink is accepted but unused here.

    Raises:
        ValueError -- optionToRun is neither "tandem" nor "xphase"
    '''
    knownOptions = ("tandem", "xphase")
    if optionToRun not in knownOptions:
        # Fail fast with a clear message; without this check the function
        # would crash on an unbound newAdjacencyList after the full scan.
        raise ValueError('optionToRun must be "tandem" or "xphase", got %r' % (optionToRun,))

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = [[] for _ in range(N1)]
    for i in range(N1):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        # Parenthesised single argument keeps the output identical on
        # Python 2 while also being valid Python 3.
        print("%s %s %s" % ("i, adjacencyList[i] : ", i, adjacencyList[i]))

    # ## (b) formation of bipartite graph
    if optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    else:  # "tandem"
        newAdjacencyList = adjacencyList

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] }
    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    Parameters:
        folderName      -- path prefix for the graph, the length file and the output
        mummerLink      -- accepted but not used in this function
        contigFilename  -- base name; "_Double.fasta" is appended to it
        contigReadGraph -- name of the stored graph to load
        repeatFilename  -- output file name; the JSON is written to
                           folderName + repeatFilename
        optionToRun     -- "tandem" (use the reachability lists as they are)
                           or "xphase" (pass them through abunGraphLib.filterEdge)

    Raises:
        ValueError     -- optionToRun is neither "tandem" nor "xphase"
        AssertionError -- the JSON read back differs from what was written
    '''
    # Validate the mode before doing any work.  An unknown mode would
    # otherwise leave newAdjacencyList unbound and fail with an
    # UnboundLocalError only after the whole reachability scan.
    if optionToRun not in ("tandem", "xphase"):
        raise ValueError('optionToRun must be "tandem" or "xphase", got %r' % (optionToRun,))

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = []
    for i in range(N1):
        reachable = abunGraphLib.findAllReachable(i, N1, G)
        adjacencyList.append(reachable)
        # Single parenthesised argument: same output under Python 2 and 3.
        print("%s %s %s" % ("i, adjacencyList[i] : ", i, reachable))

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    else:  # "xphase"
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)

    # Node 2*i is the source side of contig i and node 2*j + 1 the target
    # side of contig j; each link is inserted in both directions.
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    # ## (c) connected components, split into even (left) and odd (right) nodes
    clusters = G2.findConnectedComponents()

    repeatList = []
    for component in clusters:
        leftList, rightList = [], []
        for node in component:
            if node % 2 == 0:
                leftList.append(node)
            else:
                rightList.append(node)
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])

    repeatPath = folderName + repeatFilename
    with open(repeatPath, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Read the file back to confirm the JSON round-trip; the context manager
    # guarantees the read handle is closed.
    with open(repeatPath, 'r') as infile:
        loadData = json.load(infile)

    assert loadData == repeatList
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] }
    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    optionToRun selects how the reachability lists are used: "tandem" keeps
    them unchanged, "xphase" filters them with abunGraphLib.filterEdge.
    The repeat list is written as JSON to folderName + repeatFilename and
    read back once as a sanity check.  mummerLink is accepted but unused.

    Raises:
        ValueError     -- optionToRun is neither "tandem" nor "xphase"
        AssertionError -- the JSON read back differs from what was written
    '''
    knownOptions = ("tandem", "xphase")
    if optionToRun not in knownOptions:
        # Fail fast with a clear message; without this check the function
        # would crash on an unbound newAdjacencyList after the full scan.
        raise ValueError('optionToRun must be "tandem" or "xphase", got %r' % (optionToRun,))

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = [[] for _ in range(N1)]
    for i in range(N1):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        # Parenthesised single argument keeps the output identical on
        # Python 2 while also being valid Python 3.
        print("%s %s %s" % ("i, adjacencyList[i] : ", i, adjacencyList[i]))

    # ## (b) formation of bipartite graph
    if optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    else:  # "tandem"
        newAdjacencyList = adjacencyList

    # Even node 2*i <-> odd node 2*j + 1, linked in both directions.
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            leftNode, rightNode = 2 * i, 2 * j + 1
            G2.insertEdge(leftNode, rightNode, 1)
            G2.insertEdge(rightNode, leftNode, 1)

    # ## (c) one repeat entry per connected component
    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            # Even indices are left-side nodes, odd indices right-side nodes.
            target = leftList if eachsubitem % 2 == 0 else rightList
            target.append(eachsubitem)
        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList),
        ])

    outputPath = folderName + repeatFilename
    with open(outputPath, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Round-trip check; `with` closes the handle once the data is loaded.
    with open(outputPath, 'r') as json_data:
        loadData = json.load(json_data)

    assert loadData == repeatList
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename):
    """Build a contig-level graph from the stored contig-read graph and reduce it.

    Steps, in order:
      1. Load the graph named contigReadGraph and, for every node i below
         N1 = 2 * len(myCountDic), keep the reachable nodes j for which
         abunGraphLib.findAllPathK(i, j, G, kthres) returns at least
         edgeThres paths.
      2. Drop links between a node and its partner (i <-> i+1 for even i),
         remember those pairs, and remove the edges around them.
      3. Pass the graph through toCondenseFixer.noGoZoneDefiner and symGraph.
      4. Depending on the flags on abunHouseKeeper.abunGlobalSplitParameterRobot,
         run the optional reductions, then condense and rebuild the
         adjacency list.

    Returns:
        The processed graph object.
    """
    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    # Single parenthesised argument: prints the same text under Python 2 and 3.
    print("%s %s" % ("N1", N1))

    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = []
    for src in range(N1):
        supported = [
            dst
            for dst in abunGraphLib.findAllReachable(src, N1, G)
            if len(abunGraphLib.findAllPathK(src, dst, G, kthres)) >= edgeThres
        ]
        adj.append(supported)

    ### Filter adaptor skipped case
    adaptorPair = []
    for node, neighbours in enumerate(adj):
        # Partner of an even node is the next index, of an odd node the previous.
        mate = node + 1 if node % 2 == 0 else node - 1
        if mate in neighbours:
            neighbours.remove(mate)
            adaptorPair.append([node, mate])

    Gnew = abunGraphLib.seqGraphDynamic(N1)
    for src, neighbours in enumerate(adj):
        for dst in neighbours:
            Gnew.insertEdge(src, dst, 1997)

    for u, v in adaptorPair:
        for prevEntry in Gnew.graphNodesList[u].listOfPrevNodes:
            Gnew.removeEdge(prevEntry[0], v)
        for nextEntry in Gnew.graphNodesList[v].listOfNextNodes:
            Gnew.removeEdge(u, nextEntry[0])

    ### Trying out the new component
    import toCondenseFixer
    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)
    Gnew.symGraph()
    ### End filter adaptor skipped case

    options = abunHouseKeeper.abunGlobalSplitParameterRobot
    runSurgery = bool(options.runGraphSurgery)

    # initAdv / condense / findAdjList run in both modes; only the optional
    # reductions in between depend on runGraphSurgery.
    Gnew.initAdv()
    if runSurgery:
        if options.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)
        if options.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()
        if options.toRunTransitive:
            Gnew.transitiveReduction(
                folderName,
                mummerLink,
                contigFilename + "_Double.fasta",
                readsetFilename + "_Double.fasta",
                G,
            )
    Gnew.condense()
    Gnew.findAdjList()

    return Gnew
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """Staged X-node resolution pipeline; every stage is currently disabled.

    With all four stages guarded by a literal `if False:`, a call only
    prints N1, loads the graph named contigReadGraph, rebuilds a reachability
    graph over the first N1 = 2 * len(myCountDic) nodes and reports its edges.
    Nothing is returned.

    The disabled stages exchange data through JSON files under folderName
    (resolvedList.json, mapDummyToRealDic.json, gapContentLookUpDic.json) and
    the saved graph "xResolvedGraph", so each stage reloads what the previous
    one wrote.

    NOTE(review): the indentation of this function was reconstructed; the
    extent of each `if False:` stage should be confirmed before enabling any
    of them.
    """
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1 (disabled): resolve repeats and write the results to JSON.
    if False:
        json_data = open(folderName + "phaseRepeat.txt", "r")
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList
        ), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2 (disabled): build the gap-content lookup keyed "<first>_<second>".
    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
        )
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3 (disabled): build, condense and save the resolved graph.
    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        # One extra node per dummy entry on top of the N1 real nodes.
        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4 (disabled): append dummy segments to a copy of the contig file
    # and extract contigs from the resolved graph.
    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", "r")
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", "a")
        for i in range(len(mapDummyToRealDic)):
            # Keys are strings here because the mapping was loaded from JSON.
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
        )
def xNodeResolving(folderName, contigReadGraph):
    """Resolve X-nodes by matching incoming and outgoing neighbours on counts.

    Loads the graph named contigReadGraph and the counts stored in
    folderName + "myCountDic.json", rebuilds a reachability graph over the
    first N1 = 2 * len(myCountDic) nodes, and visits every node of it.  For
    each incoming neighbour that satisfyMatch pairs with an outgoing one,
    abunGraphLib.findPathBtwEnds is asked for the path into and out of the
    node; when both exist, two entries routed through a fresh dummy node
    (index N1 + counter) are recorded.

    Returns:
        (resolvedList, mapDummyToRealDic): the recorded paths, and a dict
        mapping each dummy counter to the index of the node it stands for.
    """
    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    with open(folderName + "myCountDic.json") as countFile:
        myCountDic = json.load(countFile)

    N1 = len(myCountDic) * 2

    ### Add resolved edge
    reachable = [abunGraphLib.findAllReachable(node, N1, G) for node in range(N1)]

    Gnew = graphLib.seqGraph(N1)
    for src, targets in enumerate(reachable):
        for dst in targets:
            Gnew.insertEdge(src, dst, 1)

    dummyCount = 0
    mapDummyToRealDic = {}
    resolvedList = []

    for v in Gnew.graphNodesList:
        inList = [entry[0] for entry in v.listOfPrevNodes]
        outList = [entry[0] for entry in v.listOfNextNodes]

        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)

        sizeList = [myCountDic[name] for name in myCountDic]
        sd = np.std(sizeList)

        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)
            if matchedOut == -1:
                # -1 is the value tested for "no match"; skip this neighbour.
                continue

            inSuccReadsList = abunGraphLib.findPathBtwEnds(
                folderName, eachIn[0], v.nodeIndex, contigReadGraph, N1
            )
            outSuccReadsList = abunGraphLib.findPathBtwEnds(
                folderName, v.nodeIndex, matchedOut, contigReadGraph, N1
            )

            if inSuccReadsList != None and outSuccReadsList != None:
                dummyNode = N1 + dummyCount

                resolvedList.append([eachIn[0]] + inSuccReadsList + [dummyNode])
                # Single parenthesised argument: same text under Python 2 and 3.
                print("%s %s" % ("in: ", resolvedList[-1]))

                resolvedList.append([dummyNode] + outSuccReadsList + [matchedOut])
                print("%s %s" % ("out: ", resolvedList[-1]))

                mapDummyToRealDic[dummyCount] = v.nodeIndex
                dummyCount += 1

    return resolvedList, mapDummyToRealDic
def formConfirmReadResolve(folderName, inList, outList, G, Grev, N1):
    """Return [c2, c1] pairs confirmed by reads that bridge an in/out node pair.

    A read qualifies as a candidate when it is the middle node (index >= N1)
    of a three-node path that abunGraphLib.findAllPathK finds between a node
    of inList and a node of outList.  Candidates linked to more than one
    distinct in-node, or to more than one distinct out-node, are dropped.
    Each remaining read is then checked with abunGraphLib.findAllReachable
    on G (giving c1) and on Grev (giving c2); the pair is kept only when
    both lookups produce exactly one distinct node.

    folderName is accepted but not used in this function.
    """
    # Single parenthesised argument: prints the same text under Python 2 and 3.
    print("%s %s %s" % ("inList , outList formConfirmReadResolve()", inList, outList))

    confirmingReadList = []
    brLFlankList = []
    brRFlankList = []

    ### Find possible candidate reads
    for inNode in inList:
        for outNode in outList:
            for candidatePath in abunGraphLib.findAllPathK(inNode, outNode, G, 3):
                isBridge = len(candidatePath) == 3 and candidatePath[1] >= N1
                if isBridge:
                    bridgingRead = candidatePath[1]
                    confirmingReadList.append(bridgingRead)
                    brLFlankList.append([inNode, bridgingRead])
                    brRFlankList.append([outNode, bridgingRead])

    ### Filter simple false cases
    toUse = {}
    confirmingReadList.sort()
    for readId, _ in groupby(confirmingReadList):
        toUse[str(readId)] = True

    # Left flanks first, then right flanks: a read attached to more than one
    # distinct flank node on either side is marked unusable.
    for flankList in (brLFlankList, brRFlankList):
        uniqueFlanks = abunHouseKeeper.getDistinct(flankList)
        uniqueFlanks.sort(key=itemgetter(1))
        for readId, members in groupby(uniqueFlanks, itemgetter(1)):
            if len(list(members)) > 1:
                toUse[str(readId)] = False

    finalSearchReadList = []
    for readKey in toUse:
        if toUse[readKey]:
            finalSearchReadList.append(int(readKey))

    ### Check paths to confirm all false cases
    resolvedList = []
    for eachR in finalSearchReadList:
        reachedInG = abunGraphLib.findAllReachable(eachR, N1, G)
        reachedInGrev = abunGraphLib.findAllReachable(eachR, N1, Grev)

        distinctInG = abunHouseKeeper.getDistinct(reachedInG)
        distinctInGrev = abunHouseKeeper.getDistinct(reachedInGrev)

        if len(distinctInG) == 1 and len(distinctInGrev) == 1:
            c1 = distinctInG[0]
            c2 = distinctInGrev[0]
            resolvedList.append([c2, c1])

    return resolvedList
def continuousIntegration():
    """Developer scratchpad of ad-hoc checks toggled by literal if-guards.

    Every section except the last is guarded by `if False:` and therefore
    never runs.  As written, a call only executes the final `if True:`
    section, which invokes nonRedundantResolver.removeRedundantWithFile on
    the hard-coded folder "May11TestB/".

    NOTE(review): the indentation of this function was reconstructed, so the
    nesting of the guarded sections (in particular which `if False:` blocks
    sit inside others) should be confirmed before enabling any of them.
    """
    # Disabled: small BFS_revisit check on a 10-node toy graph.
    if False:
        G = graphLib.seqGraph(10)
        for i in range(5):
            G.insertEdge(i,i+1,1997)
            G.insertEdge(i,i+2, 1997)

        resultList = abunGraphLib.BFS_revisit(1,3,G,1)

        print "resultList", resultList

    # Disabled: formPathSeq on hard-coded paths in "Apr10Test/".
    if False :
        folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
            "Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"

        abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)

        # Disabled: uses folderName / contigFile bound just above.
        if False:
            lenDic = IORobot.obtainLength(folderName , contigFile)
            N1 = len(lenDic)

            print "N1", N1
            G = graphLib.seqGraph(0)
            G.loadFromFile(folderName, "phaseStringGraph1")

            adj = [[] for i in range(N1)]

            for i in range(N1):
                adj[i] = abunGraphLib.findAllReachable(i, N1, G)

            Gnew = abunGraphLib.seqGraphDynamic(N1)

            for i in range(N1):
                for j in adj[i]:
                    Gnew.insertEdge(i,j,1997)

            Gnew.initAdv()
            Gnew.doubleEdgeReduction()

            contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
            contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)

            print "contigPaths", contigPaths
            print "contigReadPaths", contigReadPaths

            Gnew.transitiveReduction()

    # Disabled: decideCut on "Apr10Test/".
    if False:
        toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
        print toDelete

    # Disabled: inspect the saved "xResolvedGraph" in "Apr10TestA/".
    if False:
        G = graphLib.seqGraph(0)
        G.loadFromFile("Apr10TestA/", "xResolvedGraph")

        if False:
            for i in range(len(G.graphNodesList)):
                v = G.graphNodesList[i]

                if len(v.nodeIndexList) > 0:
                    print i , v.listOfPrevNodes , v.listOfNextNodes

        G.reportEdge()
        lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
        mylist = [401, 207, 405, 407, 344]

        json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
        myCountDic = json.load(json_data)

        for x in mylist:
            # x/2 is integer division under Python 2 (x is an int literal above).
            print x, lenDic["Contig"+str(x/2)+"_p"], myCountDic["Segkk"+str(x/2)]

    # Disabled: compare node counts of the saved graph against the contig file.
    if False:
        folderName = "Apr10TestA/"
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName , "xResolvedGraph")

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        print len(G.graphNodesList)
        print len(mapDummyToRealDic)

        print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)

    # Disabled: full abunSplitter run on "Apr10TestB/".
    if False:
        abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")

    # Disabled: removeEmbedded on "Apr10TestD/".
    if False:
        nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")

    # Disabled: rebuild the filtered contig graph and count nodes with exactly
    # two predecessors and two successors.
    if False:
        folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
        G = graphLib.seqGraph(0)
        kthres, edgeThres = 3, 1
        G.loadFromFile(folderName, contigReadGraph)
        lenDic = IORobot.obtainLength(folderName , "improved3_Double.fasta")

        N1 = len(lenDic)

        adj = [[] for i in range(N1)]

        for i in range(N1):
            tmpList = abunGraphLib.findAllReachable(i, N1, G)

            for j in tmpList:
                if len(abunGraphLib.findAllPathK(i,j,G,kthres)) >= edgeThres:
                    adj[i].append(j)

            #print i, adj[i]

        ### Filter adaptor skipped case

        adaptorPair = []

        for i in range(len(adj)):
            if i % 2 == 0:
                if i + 1 in adj[i]:
                    adj[i].remove(i+1)
                    adaptorPair.append([i, i+1])
            elif i % 2 ==1:
                if i-1 in adj[i] :
                    adj[i].remove(i-1)
                    adaptorPair.append([i, i-1])

        Gnew = abunGraphLib.seqGraphDynamic(N1)

        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i,j,1997)

        for eachpair in adaptorPair:
            u, v = eachpair[0], eachpair[1]
            for x in Gnew.graphNodesList[u].listOfPrevNodes:
                xIndex = x[0]
                Gnew.removeEdge(xIndex, v)
            for y in Gnew.graphNodesList[v].listOfNextNodes:
                yIndex = y[0]
                Gnew.removeEdge(u, yIndex)

        #Gnew.reportEdge()
        count2 = 0
        for i in range(len(Gnew.graphNodesList)):
            if len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
                count2 = count2 + 1
                print str(i)+"{color:red}"

        print "count2, ", count2

        ### End filter adaptor skipped case

    # The only section that runs.
    if True:
        nonRedundantResolver.removeRedundantWithFile("May11TestB/" , "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename):
    """Derive the contig graph from the contig-read graph and simplify it.

    The stored graph contigReadGraph is loaded and an edge i -> j is kept
    between nodes below N1 = 2 * len(myCountDic) when j is reachable from i
    and abunGraphLib.findAllPathK(i, j, G, kthres) yields at least edgeThres
    paths.  Links between a node and its partner index (even i with i + 1)
    are removed and the edges around such pairs are cut.  The graph's edges
    are reported, it is passed through toCondenseFixer.noGoZoneDefiner and
    symGraph, optionally reduced according to the flags on
    abunHouseKeeper.abunGlobalSplitParameterRobot, and finally condensed.

    Returns:
        The processed graph object.
    """
    ### Transitive reduction and remove double pointers
    N1 = len(myCountDic) * 2
    # Single parenthesised argument: prints the same text under Python 2 and 3.
    print("%s %s" % ("N1", N1))

    kthres = abunHouseKeeper.abunGlobalSplitParameterRobot.kthres
    edgeThres = abunHouseKeeper.abunGlobalSplitParameterRobot.edgeThres

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = []
    for node in range(N1):
        keptTargets = []
        for target in abunGraphLib.findAllReachable(node, N1, G):
            pathCount = len(abunGraphLib.findAllPathK(node, target, G, kthres))
            if pathCount >= edgeThres:
                keptTargets.append(target)
        adj.append(keptTargets)

    ### Filter adaptor skipped case
    adaptorPair = []
    for node in range(len(adj)):
        # Flipping the lowest bit maps an even index to the next one and an
        # odd index to the previous one.
        partner = node ^ 1
        if partner in adj[node]:
            adj[node].remove(partner)
            adaptorPair.append([node, partner])

    Gnew = abunGraphLib.seqGraphDynamic(N1)
    for node in range(N1):
        for target in adj[node]:
            Gnew.insertEdge(node, target, 1997)

    for u, v in adaptorPair:
        for predecessor in Gnew.graphNodesList[u].listOfPrevNodes:
            Gnew.removeEdge(predecessor[0], v)
        for successor in Gnew.graphNodesList[v].listOfNextNodes:
            Gnew.removeEdge(u, successor[0])

    Gnew.reportEdge()

    ### Trying out the new component
    import toCondenseFixer
    Gnew = toCondenseFixer.noGoZoneDefiner(Gnew, folderName)
    Gnew.symGraph()
    ### End filter adaptor skipped case

    settings = abunHouseKeeper.abunGlobalSplitParameterRobot
    doSurgery = bool(settings.runGraphSurgery)

    # Both modes start with initAdv and end with condense + findAdjList;
    # the reductions in between are the only mode-dependent part.
    Gnew.initAdv()
    if doSurgery:
        if settings.toRunCondenseRemove:
            Gnew.condenseEdgeRemove(G, folderName, mummerLink, contigFilename)

        if settings.toRunDoubltPtr:
            Gnew.doubleEdgeReduction()

        if settings.toRunTransitive:
            contigFasta = contigFilename + "_Double.fasta"
            readFasta = readsetFilename + "_Double.fasta"
            Gnew.transitiveReduction(folderName, mummerLink, contigFasta, readFasta, G)

    Gnew.condense()
    Gnew.findAdjList()

    return Gnew
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """Staged X-node resolution pipeline; every stage is currently disabled.

    With all four stages guarded by a literal `if False:`, a call only
    prints N1, loads the graph named contigReadGraph, rebuilds a reachability
    graph over the first N1 = 2 * len(myCountDic) nodes and reports its edges.
    Nothing is returned.

    The disabled stages exchange data through JSON files under folderName
    (resolvedList.json, mapDummyToRealDic.json, gapContentLookUpDic.json) and
    the saved graph "xResolvedGraph", so each stage reloads what the previous
    one wrote.

    NOTE(review): the indentation of this function was reconstructed; the
    extent of each `if False:` stage should be confirmed before enabling any
    of them.
    """
    N1 = len(myCountDic) * 2
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1 (disabled): resolve repeats and write the results to JSON.
    if False:
        json_data = open(folderName + "phaseRepeat.txt", 'r')
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(
            folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", 'w') as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", 'w') as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2 (disabled): build the gap-content lookup keyed "<first>_<second>".
    if False:
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
                eachitem[2], eachitem[3], eachitem[4]
            ]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", 'w') as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3 (disabled): build, condense and save the resolved graph.
    if False:
        json_data = open(folderName + "resolvedList.json", 'r')
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        # One extra node per dummy entry on top of the N1 real nodes.
        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4 (disabled): append dummy segments to a copy of the contig file
    # and extract contigs from the resolved graph.
    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", 'r')
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(
            folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", 'a')
        for i in range(len(mapDummyToRealDic)):
            # Keys are strings here because the mapping was loaded from JSON.
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                      "tmpWithDummy.fasta", gapContentLookUpDic,
                                      mapDummyToRealDic)
def xNodeResolving(folderName, contigReadGraph):
    '''
    Input : folderName (holds the stored graph and myCountDic.json),
            contigReadGraph (name of the stored graph)
    Output: (resolvedList, mapDummyToRealDic)
    Algorithm :
        1) Rebuild the reachability graph over the first N1 nodes and walk it
            a) For every incoming neighbour that satisfyMatch pairs with an
               outgoing neighbour (sd of the counts is passed along)
                i) Look up the path into and out of the node with
                   abunGraphLib.findPathBtwEnds
                ii) When both paths exist, record them through a new dummy
                    node (index N1 + counter) and note which node the
                    dummy stands for
        2) Return the recorded paths and the dummy-to-node mapping
    '''
    ### Init G, myCountDic, N1
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    with open(folderName + 'myCountDic.json') as handle:
        myCountDic = json.load(handle)

    N1 = len(myCountDic) * 2

    ### Add resolved edge
    adj = []
    for nodeIndex in range(N1):
        adj.append(abunGraphLib.findAllReachable(nodeIndex, N1, G))

    Gnew = graphLib.seqGraph(N1)
    for nodeIndex, successors in enumerate(adj):
        for successor in successors:
            Gnew.insertEdge(nodeIndex, successor, 1)

    nextDummy = 0
    mapDummyToRealDic = {}
    resolvedList = []

    for v in Gnew.graphNodesList:
        inList = [prevEntry[0] for prevEntry in v.listOfPrevNodes]
        outList = [nextEntry[0] for nextEntry in v.listOfNextNodes]

        inListCt = getCtTwoToOne(inList, myCountDic)
        outListCt = getCtTwoToOne(outList, myCountDic)

        sizeList = [myCountDic[key] for key in myCountDic]
        sd = np.std(sizeList)

        for eachIn in inListCt:
            matchedOut = satisfyMatch(eachIn, outListCt, sd)
            if matchedOut == -1:
                # -1 is the value tested for "no match"; nothing to resolve.
                continue

            centre = v.nodeIndex
            inSuccReadsList = abunGraphLib.findPathBtwEnds(
                folderName, eachIn[0], centre, contigReadGraph, N1)
            outSuccReadsList = abunGraphLib.findPathBtwEnds(
                folderName, v.nodeIndex, matchedOut, contigReadGraph, N1)

            if inSuccReadsList != None and outSuccReadsList != None:
                dummyIndex = N1 + nextDummy

                inbound = [eachIn[0]] + inSuccReadsList + [dummyIndex]
                resolvedList.append(inbound)
                # Single parenthesised argument: same text under Python 2 and 3.
                print("%s %s" % ("in: ", resolvedList[-1]))

                outbound = [dummyIndex] + outSuccReadsList + [matchedOut]
                resolvedList.append(outbound)
                print("%s %s" % ("out: ", resolvedList[-1]))

                mapDummyToRealDic[nextDummy] = v.nodeIndex
                nextDummy = nextDummy + 1

    return resolvedList, mapDummyToRealDic