def logEdges(self, folderName, stagename): print "Logging edges" logList = [] if stagename == "XResolution": mapDummyToRealDic =self.readInJSON(folderName, "mapDummyToRealDic.json") else: mapDummyToRealDic = {} for eachnode in self.graphNodesList: tmpNodeIndexList = [] for kk in eachnode.nodeIndexList: if kk >= self.N1: tmpNodeIndexList += mapDummyToRealDic[str(kk-self.N1)][1] else: tmpNodeIndexList += [kk] if len(tmpNodeIndexList) >= 2: for i in range(len(tmpNodeIndexList)-1): currentName = tmpNodeIndexList[i] nextName = tmpNodeIndexList[i+1] cName = abunHouseKeeper.parseIDToName(currentName,'C',0) nName = abunHouseKeeper.parseIDToName(nextName,'C',0) logList.append([cName, nName]) with open( folderName + stagename + ".json", 'w') as f: json.dump(logList, f)
def getCtAgg(inList, myCountDic, Gnew, lenDic): newInList = [] print "New getCtAgg", len(inList) for i in inList: tmp1 = convert4to1base(i) tmp2 = convert4to2base(i) covTmp = 0 lenTmp = 0 print "len(Gnew.graphNodesList[tmp2].nodeIndexList), len(inList)", len( Gnew.graphNodesList[tmp2].nodeIndexList), len(inList) for eachindex in Gnew.graphNodesList[tmp2].nodeIndexList: if lenDic[abunHouseKeeper.parseIDToName(eachindex, 'C', len(lenDic))] > lenTmp: lenTmp = lenDic[abunHouseKeeper.parseIDToName( eachindex, 'C', len(lenDic))] name = "Segkk" + str(eachindex / 2) covTmp = myCountDic[name] newInList.append([tmp2, covTmp]) return newInList
def condenseEdgeRemove(self, G_ContigRead, folderName, mummerLink, contigFilename): print "condenseEdgeRemove" thresPass = 100 thresForStrangeCut = 5000 ### kkdebug toRemoveList = [] for eachnode in self.graphNodesList: if len(eachnode.nodeIndexList) > 0: if len(eachnode.listOfNextNodes) ==1 : nextNodeIndex = eachnode.listOfNextNodes[0][0] nextNode= self.graphNodesList[nextNodeIndex] if len(nextNode.listOfPrevNodes) == 1 : currentName = eachnode.nodeIndex nextName = nextNode.nodeIndex contigReadPaths = findAllPathK(currentName,nextName, G_ContigRead, 5) cName = abunHouseKeeper.parseIDToName(currentName,'C',0) nName = abunHouseKeeper.parseIDToName(nextName,'C',0) noGoNext = self.readInJSON(folderName, "noGoNext.json") noGoPrev = self.readInJSON(folderName, "noGoPrev.json") overlap = [-1, -1] ctr = 0 for eachpath in contigReadPaths: if len(eachpath) > 2: ctr = ctr + 1 elif len(eachpath) == 2: contigName = cName leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) contigName = nName rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.align(leftSeg, rightSeg, folderName, mummerLink) if ctr <= thresPass and (cName in noGoNext or nName in noGoPrev or overlap[0] > thresForStrangeCut ): self.removeEdge(currentName, nextName) toRemoveList.append([currentName, nextName]) ### kkdebug #with open( "dataFolder/toRemoveList.json", 'w') as f: # json.dump(toRemoveList, f) self.findAdjList()
def findNoGoByNoHeads(noGoList, side, folderName): noGoListNew = [] sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\ formSortedDataList(folderName) lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta") lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") for x in noGoList: rList = findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic, lenDicContig, lenDicRead) cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig, lenDicRead) if bestMatchContigOnly == False: bestContigIDList = findBreakContigAdv(cList) else: bestContigIDList = findBreakContig(cList) if len(rList) > 0 and len(cList) > 0: print "x, side, len(rList), len(cList), len(bestContigIDList)",\ abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList) print "cList", bestContigIDList noGoListNew = noGoListNew + bestContigIDList return noGoListNew
def findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic, lenDicContig, lenDicRead):
    """Return the distinct last-column tags of the rows matching ``x``.

    ``x`` is converted with ``abunHouseKeeper.parseIDToName(x, 'C', 0)``.
    ``sortedContigDic`` gives the index of the first row for that name in
    ``sortedContigList``; rows are scanned from there while their
    second-to-last field equals the name.  A row contributes its last field
    when ``overlapCR(row, side, thresMiddleContig, lenDicContig, lenDicRead)``
    is true.  The result is passed through ``abunHouseKeeper.getDistinct``.
    An empty list is returned when the name is not a key of
    ``sortedContigDic``.

    Row format quoted in the original source:

        [S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [ IDY] | [TAGS]
        =====================================================================================
        1 562 | 819 1418 | 562 600 | 84.72 | Contig0_d Read121_d
        1 562 | 4077 3478 | 562 600 | 84.72 | Contig0_d Read121_p
        1 564 | 656 68 | 564 589 | 90.13 | Contig0_d Read382_d
        1 564 | 6996 7584 | 564 589 | 90.13 | Contig0_d Read382_p
        1 571 | 1386 815 | 571 572 | 86.60 | Contig0_d Read421_d
    """
    thres = thresMiddleContig
    key = abunHouseKeeper.parseIDToName(x, 'C', 0)

    if key not in sortedContigDic:
        return []

    matched = []
    pos = sortedContigDic[key]
    # Rows for one name are contiguous, so stop at the first row whose
    # second-to-last field differs.
    while pos < len(sortedContigList) and sortedContigList[pos][-2] == key:
        row = sortedContigList[pos]
        if overlapCR(row, side, thres, lenDicContig, lenDicRead):
            matched.append(row[-1])
        pos += 1

    return abunHouseKeeper.getDistinct(matched)
def findNoGoByNoHeads(noGoList, side, folderName): noGoListNew = [] sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\ formSortedDataList(folderName) lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta" ) lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") for x in noGoList: rList = findAttachedReads(x, side, folderName,sortedContigList,sortedContigDic, lenDicContig,lenDicRead) cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig,lenDicRead) if bestMatchContigOnly == False: bestContigIDList = findBreakContigAdv(cList) else: bestContigIDList = findBreakContig(cList) if len(rList) > 0 and len(cList) > 0: print "x, side, len(rList), len(cList), len(bestContigIDList)",\ abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList) print "cList", bestContigIDList noGoListNew = noGoListNew + bestContigIDList return noGoListNew
def bipartiteLocalResolve(self, resolvedList, inList, outList, folderName):
    """Clear the listed in/out sides and insert the resolved edges.

    ``self.clearOut(u/2)`` is called for every ``u`` in ``inList`` and
    ``self.clearIn(v/2)`` for every ``v`` in ``outList``.  Then, for every
    entry of ``resolvedList``, ``self.insertEdge(first, last, 1997)`` is
    called with the entry's first and last elements.

    ``folderName`` is currently unused: the filter that read "noGoNext.json"
    and "noGoPrev.json" from it is disabled, so every resolved edge is
    inserted.
    """
    # NOTE(review): `/` relies on Python 2 integer division; assumes the
    # entries of inList/outList are ints.
    for inIndex in inList:
        self.clearOut(inIndex / 2)

    for outIndex in outList:
        self.clearIn(outIndex / 2)

    for resolved in resolvedList:
        src = resolved[0]
        dst = resolved[-1]

        # Names were only needed by the disabled noGo filter
        # (skip when cName in noGoNext or nName in noGoPrev); the lookups are
        # kept so behaviour is unchanged.
        cName = abunHouseKeeper.parseIDToName(src, 'C', 0)
        nName = abunHouseKeeper.parseIDToName(dst, 'C', 0)

        self.insertEdge(src, dst, 1997)
def bipartiteLocalResolve(self, resolvedList, inList, outList, folderName):
    """Clear the listed in/out sides and insert the resolved edges.

    Nothing happens when ``resolvedList`` is empty.  Otherwise
    ``self.clearOut(u/2)`` is called for every ``u`` in ``inList``,
    ``self.clearIn(v/2)`` for every ``v`` in ``outList``, and
    ``self.insertEdge(first, last, 1997)`` for the first and last elements of
    every entry of ``resolvedList``.

    ``folderName`` is currently unused: the filter that read "noGoNext.json"
    and "noGoPrev.json" from it is disabled.
    """
    if len(resolvedList) == 0:
        # Without any resolved edge the existing connections are left intact.
        return

    # NOTE(review): `/` relies on Python 2 integer division; assumes the
    # entries of inList/outList are ints.
    for inIndex in inList:
        self.clearOut(inIndex / 2)

    for outIndex in outList:
        self.clearIn(outIndex / 2)

    for resolved in resolvedList:
        src, dst = resolved[0], resolved[-1]

        # Only the disabled noGo filter used these names; the lookups are
        # kept so behaviour is unchanged.
        cName = abunHouseKeeper.parseIDToName(src, 'C', 0)
        nName = abunHouseKeeper.parseIDToName(dst, 'C', 0)

        self.insertEdge(src, dst, 1997)
def getCtAgg(inList, myCountDic, Gnew, lenDic): newInList = [] print "New getCtAgg", len(inList) for i in inList: tmp1 = convert4to1base(i) tmp2 = convert4to2base(i) covTmp = 0 lenTmp = 0 print "len(Gnew.graphNodesList[tmp2].nodeIndexList), len(inList)", len( Gnew.graphNodesList[tmp2].nodeIndexList ), len(inList) for eachindex in Gnew.graphNodesList[tmp2].nodeIndexList: if lenDic[abunHouseKeeper.parseIDToName(eachindex, "C", len(lenDic))] > lenTmp: lenTmp = lenDic[abunHouseKeeper.parseIDToName(eachindex, "C", len(lenDic))] name = "Segkk" + str(eachindex / 2) covTmp = myCountDic[name] newInList.append([tmp2, covTmp]) return newInList
def findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic, lenDicContig, lenDicRead):
    """Collect the distinct last-field tags of the rows that belong to ``x``.

    The name for ``x`` comes from ``abunHouseKeeper.parseIDToName(x, 'C', 0)``.
    When it is a key of ``sortedContigDic`` the stored value is used as the
    starting row in ``sortedContigList``; consecutive rows whose
    second-to-last field equals the name are tested with
    ``overlapCR(row, side, thresMiddleContig, lenDicContig, lenDicRead)`` and
    the last field of each passing row is kept.  Duplicates are removed with
    ``abunHouseKeeper.getDistinct``.  An unknown name yields ``[]``.

    Row format quoted in the original source:

        [S1] [E1] | [S2] [E2] | [LEN 1] [LEN 2] | [ IDY] | [TAGS]
        =====================================================================================
        1 562 | 819 1418 | 562 600 | 84.72 | Contig0_d Read121_d
        1 562 | 4077 3478 | 562 600 | 84.72 | Contig0_d Read121_p
        1 564 | 656 68 | 564 589 | 90.13 | Contig0_d Read382_d
        1 564 | 6996 7584 | 564 589 | 90.13 | Contig0_d Read382_p
        1 571 | 1386 815 | 571 572 | 86.60 | Contig0_d Read421_d
    """
    thres = thresMiddleContig
    key = abunHouseKeeper.parseIDToName(x, 'C', 0)

    if key in sortedContigDic:
        tags = []
        cursor = sortedContigDic[key]
        while cursor < len(sortedContigList):
            record = sortedContigList[cursor]
            if record[-2] != key:
                # Left the contiguous run of rows for this name.
                break
            if overlapCR(record, side, thres, lenDicContig, lenDicRead):
                tags.append(record[-1])
            cursor = cursor + 1
        return abunHouseKeeper.getDistinct(tags)

    return []
def singleGapLookUp(eachmatchpair,folderName, N1, mummerLink, contigReadGraph, contigFilename,readsetFilename): #print eachmatchpair leftCtgIndex ,rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0],eachmatchpair[-1],0,0,"" succReadsList = [] G = seqGraphWt(0) G.loadFromFile(folderName, contigReadGraph) succReadsList = BFS(leftCtgIndex,rightCtgIndex, G, N1) if len(succReadsList) > 0: succReadsList.pop(0) succReadsList.pop(-1) else: print "interesting item for future study" print "succReadsList" , succReadsList if len(succReadsList) == 0: contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1) leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1) rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap contig : ", overlap leftEnd = len(leftSeg) - overlap[0] middleContent = "" else: contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1) print contigName leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) readName = abunHouseKeeper.parseIDToName(succReadsList[0], 'R', N1) print readName rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap start read : ", overlap leftEnd = len(leftSeg) - overlap[0] middleContent = "" for i in range(len(succReadsList)-1): readName = abunHouseKeeper.parseIDToName(succReadsList[i], 'R', N1) leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) readName = abunHouseKeeper.parseIDToName(succReadsList[i+1], 'R', N1) rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) overlap = 
IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap middle read : ", overlap middleContent = middleContent + leftSeg[0:len(leftSeg)-overlap[0]] readName = abunHouseKeeper.parseIDToName(succReadsList[-1], 'R', N1) leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1) rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap end read : ", overlap middleContent = middleContent + leftSeg[0:len(leftSeg)-overlap[0]] return [leftCtgIndex ,rightCtgIndex, leftEnd, rightStart, middleContent]
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename): print eachmatchpair leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, "" succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1) succReadsList = [] G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5) # shuffle(allPaths) print "allPaths", allPaths possibleList = [] for p in allPaths: noContig = True for pp in p[1:-1]: if pp < N1: noContig = False if noContig == True: possibleList.append(p) print "possibleList", possibleList minListLen = 1000 for p in possibleList: if len(p) < minListLen: succReadsList = p minListLen = len(p) if len(succReadsList) > 0: succReadsList.pop(0) succReadsList.pop(-1) else: print "interesting item for future study" print "succReadsList", succReadsList if len(succReadsList) == 0: contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1) leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1) rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap contig : ", overlap leftEnd = len(leftSeg) - overlap[0] middleContent = "" else: contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1) print contigName leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) readName = abunHouseKeeper.parseIDToName(succReadsList[0], "R", N1) print readName rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) 
print "overlap start read : ", overlap leftEnd = len(leftSeg) - overlap[0] middleContent = "" for i in range(len(succReadsList) - 1): readName = abunHouseKeeper.parseIDToName(succReadsList[i], "R", N1) leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], "R", N1) rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap middle read : ", overlap middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]] readName = abunHouseKeeper.parseIDToName(succReadsList[-1], "R", N1) leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName) contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1) rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName) overlap = IORobot.alignWithName( leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) ) print "overlap end read : ", overlap middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]] return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]