def adjListToRepeatList(newAdjacencyList, folderName, repeatFilename):
    '''
    Cluster an adjacency list into repeat pairs and save them as JSON.

    A graph with 2 * len(newAdjacencyList) nodes is built in which index i
    is represented by the even node 2 * i and every partner j listed in
    newAdjacencyList[i] by the odd node 2 * j + 1; an edge of weight 1 is
    inserted in both directions between the two.  Each connected component
    is then split into its even ("left") and odd ("right") node ids, and
    the pair of lists forms one entry of the result.

    Input :
        newAdjacencyList : indexable by 0 .. len - 1; entry i lists the
                           partner indices of i
        folderName       : output folder prefix.  It is concatenated
                           directly with repeatFilename, so it must already
                           end with a path separator
        repeatFilename   : name of the JSON file to write
    Output :
        None.  [[leftNodes, rightNodes], ...] is written to
        folderName + repeatFilename.
    Raises :
        AssertionError if the data read back from the file differs from
        what was written.
    '''
    N1 = len(newAdjacencyList)

    # Bipartite graph: even ids are the "left" side, odd ids the "right".
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for cluster in clusters:
        leftList, rightList = [], []
        for node in cluster:
            if node % 2 == 0:
                leftList.append(node)
            else:
                rightList.append(node)

        # getDistinct presumably drops duplicate ids -- confirm in abunHouseKeeper.
        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    repeatPath = folderName + repeatFilename
    with open(repeatPath, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Round-trip sanity check; the handle is closed as soon as it is read.
    with open(repeatPath, 'r') as infile:
        loadData = json.load(infile)

    assert loadData == repeatList
def adjListToRepeatList(newAdjacencyList, folderName, repeatFilename):
    '''
    Cluster an adjacency list into repeat pairs and save them as JSON.

    A graph with 2 * len(newAdjacencyList) nodes is built in which index i
    is represented by the even node 2 * i and every partner j listed in
    newAdjacencyList[i] by the odd node 2 * j + 1; an edge of weight 1 is
    inserted in both directions between the two.  Each connected component
    is then split into its even ("left") and odd ("right") node ids, and
    the pair of lists forms one entry of the result.

    Input :
        newAdjacencyList : indexable by 0 .. len - 1; entry i lists the
                           partner indices of i
        folderName       : output folder prefix.  It is concatenated
                           directly with repeatFilename, so it must already
                           end with a path separator
        repeatFilename   : name of the JSON file to write
    Output :
        None.  [[leftNodes, rightNodes], ...] is written to
        folderName + repeatFilename.
    Raises :
        AssertionError if the data read back from the file differs from
        what was written.
    '''
    N1 = len(newAdjacencyList)

    # Bipartite graph: even ids are the "left" side, odd ids the "right".
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for cluster in clusters:
        leftList, rightList = [], []
        for node in cluster:
            if node % 2 == 0:
                leftList.append(node)
            else:
                rightList.append(node)

        # getDistinct presumably drops duplicate ids -- confirm in abunHouseKeeper.
        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    repeatPath = folderName + repeatFilename
    with open(repeatPath, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Round-trip sanity check; the handle is closed as soon as it is read.
    with open(repeatPath, 'r') as infile:
        loadData = json.load(infile)

    assert loadData == repeatList
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] }
            written as JSON to folderName + repeatFilename (nothing is returned)
    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    Parameters:
        folderName      : working folder prefix; concatenated directly with
                          file names, so it must end with a path separator
        mummerLink      : not used by this function
        contigFilename  : contig base name; lengths are read from
                          contigFilename + "_Double.fasta"
        contigReadGraph : name of the graph file loaded from folderName
        repeatFilename  : name of the JSON output file
        optionToRun     : "tandem" uses the raw reachability lists,
                          "xphase" passes them through abunGraphLib.filterEdge
    Raises:
        ValueError      if optionToRun is neither "tandem" nor "xphase"
        AssertionError  if the JSON read back differs from what was written
    '''
    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = []
    for i in range(N1):
        reachable = abunGraphLib.findAllReachable(i, N1, G)
        adjacencyList.append(reachable)
        # Single-argument print keeps the output text identical on Python 2
        # while also being valid Python 3.
        print("i, adjacencyList[i] :  %s %s" % (i, reachable))

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    else:
        # Previously this fell through and failed later with an unbound
        # local variable; report the real problem instead.
        raise ValueError("optionToRun must be 'tandem' or 'xphase', got %r" % (optionToRun,))

    # Even node 2*i stands for contig i on the "left", odd node 2*j+1 for
    # contig j on the "right".
    # NOTE(review): from here on this mirrors adjListToRepeatList, except that
    # the graph is sized from lenDicCC rather than from the adjacency list.
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    # ## (c) connected components become the repeat pairs
    clusters = G2.findConnectedComponents()

    repeatList = []
    for cluster in clusters:
        leftList, rightList = [], []
        for node in cluster:
            if node % 2 == 0:
                leftList.append(node)
            else:
                rightList.append(node)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    repeatPath = folderName + repeatFilename
    with open(repeatPath, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Round-trip sanity check; the handle is closed as soon as it is read.
    with open(repeatPath, 'r') as infile:
        loadData = json.load(infile)

    assert loadData == repeatList
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] }
            written as JSON to folderName + repeatFilename (nothing is returned)
    Algorithm:
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    Parameters:
        folderName      : working folder prefix; concatenated directly with
                          file names, so it must end with a path separator
        mummerLink      : not used by this function
        contigFilename  : contig base name; lengths are read from
                          contigFilename + "_Double.fasta"
        contigReadGraph : name of the graph file loaded from folderName
        repeatFilename  : name of the JSON output file
        optionToRun     : "tandem" uses the raw reachability lists,
                          "xphase" passes them through abunGraphLib.filterEdge
    Raises:
        ValueError      if optionToRun is neither "tandem" nor "xphase"
        AssertionError  if the JSON read back differs from what was written
    '''
    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    adjacencyList = []
    for i in range(N1):
        reachable = abunGraphLib.findAllReachable(i, N1, G)
        adjacencyList.append(reachable)
        # Single-argument print keeps the output text identical on Python 2
        # while also being valid Python 3.
        print("i, adjacencyList[i] :  %s %s" % (i, reachable))

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    else:
        # Previously this fell through and failed later with an unbound
        # local variable; report the real problem instead.
        raise ValueError("optionToRun must be 'tandem' or 'xphase', got %r" % (optionToRun,))

    # Even node 2*i stands for contig i on the "left", odd node 2*j+1 for
    # contig j on the "right".
    # NOTE(review): from here on this mirrors adjListToRepeatList, except that
    # the graph is sized from lenDicCC rather than from the adjacency list.
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    # ## (c) connected components become the repeat pairs
    clusters = G2.findConnectedComponents()

    repeatList = []
    for cluster in clusters:
        leftList, rightList = [], []
        for node in cluster:
            if node % 2 == 0:
                leftList.append(node)
            else:
                rightList.append(node)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    repeatPath = folderName + repeatFilename
    with open(repeatPath, 'w') as outfile:
        json.dump(repeatList, outfile)

    # Round-trip sanity check; the handle is closed as soon as it is read.
    with open(repeatPath, 'r') as infile:
        loadData = json.load(infile)

    assert loadData == repeatList
def defineRepeatAndFlanking(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, repeatSpec):
    '''
    Input :
    V    a) String graph : G
    V    b) Repeat Pairing : repeatList

    Output :
    V    a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E])
    V    b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
    V    c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])

    Algorithm :
    V    1. Find repeat by graph operations
    V    2. Find flanking region by graph operations
    V    3. Find associated reads by graph operations

    Parameters:
        folderName      : working folder prefix; concatenated directly with
                          file names, so it must end with a path separator
        mummerLink      : not used by this function
        contigFilename  : contig base name; lengths are read from
                          contigFilename + "_Double.fasta"
        contigReadGraph : name of the graph file loaded from folderName
        repeatFilename  : JSON file holding the repeat pairing to read
        repeatSpec      : JSON file to write the result to

    Nothing is returned.  For every repeat with exactly two (or exactly one)
    in-contigs and the same number of out-contigs, one entry
    [flankingList, allPassList, repeatPathway, flankingPathsList]
    is appended to the list dumped to folderName + repeatSpec; repeats of any
    other shape are skipped.
    '''
    print("defineRepeatAndFlanking: ")

    # 0. Load previous data
    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraph(G)

    with open(folderName + repeatFilename, 'r') as json_data:
        repeatList = json.load(json_data)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    # Single-argument prints keep the output text identical on Python 2
    # while also being valid Python 3.
    print("repeatList:  %s" % (repeatList,))
    print("len(G.graphNodesList) %s" % (len(G.graphNodesList),))

    bigDumpList = []
    print("len(repeatList) %s %s" % (len(repeatList), repeatList))

    for r in repeatList:
        # Stored ids are bipartite node ids (2*i on the left, 2*j+1 on the
        # right); floor division recovers the contig index on both sides.
        rIn = [eachitem // 2 for eachitem in r[0]]
        rOut = [eachitem // 2 for eachitem in r[1]]

        isTwoByTwo = len(rIn) == 2 and len(rOut) == 2
        isOneByOne = len(rIn) == 1 and len(rOut) == 1
        if not (isTwoByTwo or isOneByOne):
            continue

        print("%s %s" % (rIn, rOut))

        if isOneByOne:
            # Duplicate the single contig so the 2-in / 2-out code path applies.
            rIn = [rIn[0], rIn[0]]
            rOut = [rOut[0], rOut[0]]

        # 1. Records reachable indices
        kkIn = [str(eachkk) + "_in" for eachkk in rIn]
        kkOut = [str(eachkk) + "_out" for eachkk in rOut]
        abunGraphLib.markReachableIndices(G, Grev, kkIn, kkOut, N1)

        # 2. Marks inside nodes
        singleMissList, allPassList = abunGraphLib.markInsideNodes(G, kkIn, kkOut)
        for i in range(4):
            print("len(singleMissList[i]), len(allPassList) %s %s"
                  % (len(singleMissList[i]), len(allPassList)))

        # 3. Finds start/end of repeat
        myStartIndex, myEndIndex = abunGraphLib.markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
        print("%s %s" % (myStartIndex, myEndIndex))

        # 4. Find repeat interior by shortest path joining S/E
        repeatPathway = abunGraphLib.markInterior(G, myStartIndex, myEndIndex, N1)
        print("repeatPathway %s" % (repeatPathway,))

        # 5. Find flanking region by shortest path search again
        flankingPathsList = abunGraphLib.markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
        print("%s" % (flankingPathsList,))

        # 6. Find associated reads by graph node query
        # The second return value is deliberately discarded: the
        # "experimental" override below replaces it with allPassList.  A
        # separate local name is used so the list being iterated is not
        # rebound inside its own loop.
        flankingList, _ = abunGraphLib.markAssociatedReads(G, singleMissList, allPassList)

        # ## Experimental
        repeatReadList = allPassList
        # ## End Experimental

        for eachlist in flankingList:
            print("%s %s" % (len(eachlist), len(repeatReadList)))

        bigDumpList.append([flankingList, repeatReadList, repeatPathway, flankingPathsList])

    # 7. Format return and move on to the phasing
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
def BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic, mummerLink):
    '''
    Resolve repeats of Gnew locally, one bipartite in/out cluster at a time.

    Does nothing (and returns Gnew untouched) unless
    abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve is set.
    Otherwise:
        1. the adjacency of Gnew is clustered into repeat pairs via
           repeatFinder.adjListToRepeatList (file "phaseRepeatTR.txt");
        2. for every non-empty pair the in/out matching is decided either
           by the abundance and/or BR resolvers, or by emalgo when
           abunHouseKeeper.abunGlobalRunEM is set;
        3. the matching is applied with Gnew.bipartiteLocalResolve and the
           graph is condensed at the end.

    Parameters:
        Gnew            : graph to resolve; modified in place and returned
        folderName      : working folder prefix; concatenated directly with
                          file names, so it must end with a path separator
        contigReadGraph : name of the graph file loaded from folderName
        N1              : passed through to the resolvers
        myCountDic      : passed through to the abundance resolvers
        lenDic          : passed through to determindMatchAggregate
        mummerLink      : passed through to emalgo.BResolvePreparation

    Side effects:
        - writes "phaseRepeatTR.txt", "biResolvedCombineList.json" (always an
          empty list) and "abunAnalysisList.json" into folderName;
        - may overwrite AbunLower / AbunUpper / BRThres on the global
          parameter robot with their "...B" counterparts when those are > 0.

    Returns:
        Gnew
    '''
    splitParams = abunHouseKeeper.abunGlobalSplitParameterRobot

    # Nothing to do when B-resolution is switched off.
    if not splitParams.runBResolve:
        return Gnew

    # Single-argument prints keep the output text identical on Python 2
    # while also being valid Python 3.
    print("abunHouseKeeper.abunGlobalSplitParameterRobot.runBResolve %s"
          % (splitParams.runBResolve,))
    maxRThres = splitParams.RThres

    repeatFinder.adjListToRepeatList(Gnew.adj, folderName, "phaseRepeatTR.txt")
    with open(folderName + "phaseRepeatTR.txt", 'r') as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    # Never filled in below, but still dumped so the output file keeps existing.
    biResolvedCombineList = []

    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraphFast(G)

    abunAnalysisList = []

    # NOTE(review): the branch nesting below was inferred from a
    # whitespace-collapsed copy of this function; confirm that the emalgo
    # `else` pairs with the abunGlobalRunEM check.
    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]

        if not abunHouseKeeper.abunGlobalRunEM:
            resolvedList, brResolvedList = [], []

            if splitParams.toRunAbunB:
                # B-specific thresholds override the global ones when given.
                if splitParams.AbunLowerB > 0:
                    splitParams.AbunLower = splitParams.AbunLowerB

                if splitParams.AbunUpperB > 0:
                    splitParams.AbunUpper = splitParams.AbunUpperB

                if not splitParams.toRunAggB:
                    resolvedList = determindMatch(
                        inList, outList, myCountDic, folderName,
                        contigReadGraph, N1)
                else:
                    resolvedList = determindMatchAggregate(
                        inList, outList, myCountDic, folderName,
                        contigReadGraph, N1, Gnew, lenDic)

            if splitParams.toRunBRB:
                if splitParams.BRThresB > 0:
                    splitParams.BRThres = splitParams.BRThresB

                brResolvedList = formBRReolve(folderName, inList, outList, G, Grev, True, N1)

            combinedList = abunHouseKeeper.getDistinct(resolvedList + brResolvedList)

            print("resolvedList, brResolvedList, inList, outList %s %s %s %s"
                  % (resolvedList, brResolvedList, inList, outList))
            # resolveConflict is called once per use, as before; it is not
            # known here whether its result may be cached.
            print("resolveConflict(combinedList) %s" % (resolveConflict(combinedList),))

            abunAnalysisList.append([
                inList, outList, resolvedList, brResolvedList,
                resolveConflict(combinedList)
            ])

            # Only clusters that are non-empty and small enough on both
            # sides are actually resolved.
            if len(inList) <= maxRThres and len(outList) <= maxRThres \
                    and len(inList) > 0 and len(outList) > 0:
                resolvedCombine = resolveConflict(combinedList)
                Gnew.bipartiteLocalResolve(resolvedCombine, inList, outList, folderName)
        else:
            # Imported lazily so emalgo is only needed when EM is enabled.
            import emalgo
            resolvedCombine = emalgo.BResolvePreparation(
                folderName, inList, outList, G, Grev, N1, mummerLink)
            Gnew.bipartiteLocalResolve(resolvedCombine, inList, outList, folderName)

    Gnew.condense()

    with open(folderName + "biResolvedCombineList.json", 'w') as f:
        json.dump(biResolvedCombineList, f)

    with open(folderName + "abunAnalysisList.json", 'w') as f:
        json.dump(abunAnalysisList, f)

    return Gnew