def formSeqGraph(folderName, mummerLink): print "formSeqGraph" startList, graphNodes = [], [] print "formSeqGraph: Reading in best successors and predecessors" rightConnect = readConnectList(folderName, "rightConnect.txt") leftConnect = readConnectList(folderName, "leftConnect.txt") numberOfNodes = len(rightConnect) print "formSeqGraph: Initializing seqGraph" G = graphLib.seqGraph(numberOfNodes) print "formSeqGraph: Adding edges to seqGraph" for eachitem, i in zip(rightConnect, range(len(rightConnect))): index = i connector, weight = eachitem G.insertEdge(index, connector, weight) for eachitem, i in zip(leftConnect, range(len(leftConnect))): index = i connector, weight = eachitem G.insertEdge(connector, index, weight) G.cleanEdge() G.condense() print "formSeqGraph: Outputting seqGraph to condensedGraph.txt" G.saveToFile(folderName, "condensedGraph.txt") G.checkSelfLoops() G.checkCompleteness() G2 = graphLib.seqGraph(0) G2.loadFromFile(folderName, "condensedGraph.txt") houseKeeper.compareGraphUnitTest(G, G2) G.reportDummyUsefulNode() G.reportEdge() graphFileName = "condensedGraph.txt" contigFile = "noEmbed_Double.fasta" outContigFile = "improved.fasta" outOpenList = "openZone.txt" print "formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta" IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
def readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList): print "readContigOut" G = graphLib.seqGraph(0) G.loadFromFile(folderName, graphFileName) G.findStartEndList() myContigsDic = loadContigsFromFile(folderName, contigFile) contigUsed = [False for i in range(len(G.graphNodesList) / 2)] seqToPrint = [] openList = [] noForRevMismatch = True for eachnode in G.graphNodesList: if len(eachnode.nodeIndexList) > 0: tmpSeq = "" # ## debug consistency of t/f ckList = [] for dummy in eachnode.nodeIndexList: indexToAdd = dummy readNum = indexToAdd / 2 ckList.append(contigUsed[readNum]) if (len(ckList) > 0 and not all(ckList) and any(ckList)): noForRevMismatch = False # ## end debug for i in range(len(eachnode.nodeIndexList)): indexToAdd = eachnode.nodeIndexList[i] readNum = indexToAdd / 2 orientation = indexToAdd % 2 if contigUsed[readNum] == False: if i != len(eachnode.nodeIndexList) - 1: overlapLen = eachnode.overlapList[i] if orientation == 0: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'p'][0:-overlapLen] else: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'd'][0:-overlapLen] else: if orientation == 0: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'p'] else: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'd'] contigUsed[readNum] = True if len(tmpSeq) > 0: if eachnode.nodeIndex in G.myStartList: openList.append('Segkk' + str(len(seqToPrint)) + ',noprev') if eachnode.nodeIndex in G.myEndList: openList.append('Segkk' + str(len(seqToPrint)) + ',nonext') # ## Debug if eachnode.nodeIndex == 444: print 439, len(seqToPrint) if eachnode.nodeIndex == 67: print 67, len(seqToPrint) # ## End Debug seqToPrint.append(tmpSeq) print "No forward/reverse mismatch ?", noForRevMismatch fImproved = open(folderName + outContigFile, 'w') for eachcontig, dummyIndex in zip(seqToPrint, range(len(seqToPrint))): fImproved.write(">Segkk" + str(dummyIndex) + '\n') fImproved.write(eachcontig + '\n') fImproved.close() print "All contigs used? ", all(contigUsed) print "NContig", len(seqToPrint) f = open(folderName + outOpenList, 'w') f.write(str(len(seqToPrint)) + '\n') for eachitem in openList: f.write(str(eachitem) + str('\n')) f.close()
def xPhased(folderName , mummerLink): # ## Repeat resolution [Proxy for MB] # 1. Re-form the contig string graph with ALL connections from contigs only V # 2. Log down the reads and associated blocked contigs V # 3. Use reads to connect; # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important); # 5. Read out contigs print "xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta" numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb") lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta") confidenLenThres = 0 print "xPhased: Building seqGraph" G = graphLib.seqGraph(numberOfContig) extraEdges = loadEdgeFromBlockedReads(folderName) for eachitem in dataSet: # print eachitem wt, myin, myout = eachitem myInData = myin[6:].split('_') myOutData = myout[6:].split('_') if myInData[1] == 'p': offsetin = 0 else: offsetin = 1 if myOutData[1] == 'p': offsetout = 0 else: offsetout = 1 i = int(myInData[0]) * 2 + offsetin j = int(myOutData[0]) * 2 + offsetout ck = False for eachedge in extraEdges: mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3] if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres: ck = True if ck: G.insertEdge(i, j, wt) # G.reportEdge() G.MBResolve() G.reportEdge() print "xPhased: Saving condensed seqGraph to condensedGraphMB.txt" G.saveToFile(folderName, "condensedGraphMB.txt") graphFileName = "condensedGraphMB.txt" contigFile = "improved2_Double.fasta" outContigFile = "improved3.fasta" outOpenList = "openZoneMB.txt" print "xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta" IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList) # ## Repeat resolution [Proxy for phasing step] # 6. Find out the repeat region by MSA # 7. Find out the location of SNPs and extend across repeat # [short cut : use contig creator : your job here is to get data into the correct formats] print "xPhased"
def readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList, nameDic={}): print "readContigOut" G = graphLib.seqGraph(0) G.loadFromFile(folderName, graphFileName) G.findStartEndList() myContigsDic = loadContigsFromFile(folderName, contigFile) contigUsed = [False for i in range(len(G.graphNodesList) / 2)] seqToPrint = [] openList = [] noForRevMismatch = True print len(G.graphNodesList) for eachnode in G.graphNodesList: print eachnode.nodeIndexList if len(eachnode.nodeIndexList) > 0: tmpSeq = "" # ## debug consistency of t/f ckList = [] for dummy in eachnode.nodeIndexList: indexToAdd = dummy readNum = indexToAdd / 2 ckList.append(contigUsed[readNum]) if (len(ckList) > 0 and not all(ckList) and any(ckList)): noForRevMismatch = False # ## end debug if contigUsed[eachnode.nodeIndexList[0] / 2] == False: contigUsed[eachnode.nodeIndexList[0] / 2] = True contigUsed[eachnode.nodeIndexList[-1] / 2] = True for i in range(len(eachnode.nodeIndexList)): indexToAdd = eachnode.nodeIndexList[i] readNum = indexToAdd / 2 orientation = indexToAdd % 2 #print nameDic[indexToAdd] if len(nameDic) > 0: orientation = nameDic[indexToAdd] % 2 readNum = nameDic[indexToAdd] / 2 #print readNum if i != len(eachnode.nodeIndexList) - 1: overlapLenOld = eachnode.overlapList[i] # Can we hijack here for the overlap Length... seems like minimal changes overlapLen = useAlignToGetLen(eachnode, i, nameDic, orientation, myContigsDic, readNum, folderName, mummerLink) # End Hijacking print overlapLen, overlapLenOld if orientation == 0: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'p'][0:-overlapLen] else: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'd'][0:-overlapLen] else: if orientation == 0: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'p'] else: tmpSeq = tmpSeq + myContigsDic['Contig' + str(readNum) + '_' + 'd'] if len(tmpSeq) > 0: if eachnode.nodeIndex in G.myStartList: openList.append('Segkk' + str(len(seqToPrint)) + ',noprev') if eachnode.nodeIndex in G.myEndList: openList.append('Segkk' + str(len(seqToPrint)) + ',nonext') seqToPrint.append(tmpSeq) print "No forward/reverse mismatch ?", noForRevMismatch fImproved = open(folderName + outContigFile, 'w') for eachcontig, dummyIndex in zip(seqToPrint, range(len(seqToPrint))): print len(eachcontig) fImproved.write(">Segkk" + str(dummyIndex) + '\n') fImproved.write(eachcontig + '\n') fImproved.close() print "All contigs used? ", all(contigUsed) print "NContig", len(seqToPrint) f = open(folderName + outOpenList, 'w') f.write(str(len(seqToPrint)) + '\n') for eachitem in openList: f.write(str(eachitem) + str('\n')) f.close()