def abunSplit(folderName, mummerLink, myCountDic):
    """
    Use abundance counts to split repeats and emit abun.fasta.

    Loads the repeat in/out contig pairs from phaseRepeat.txt, matches
    each pair by abundance, records the matches as edges of a sequence
    graph, condenses that graph and extracts the resulting contigs.
    """
    fin = open(folderName + "phaseRepeat.txt", "r")
    repeatPairs = obtainNonEmpty(json.load(fin))

    # two graph nodes per contig (forward / reverse strands)
    nNodes = len(myCountDic) * 2
    splitGraph = graphLib.seqGraph(nNodes)

    for pair in repeatPairs:
        matches = determindMatch(pair[0], pair[1], myCountDic)
        addEdges(splitGraph, matches)

    splitGraph.condense()
    IORobot.extractGraphToContigs(splitGraph, folderName, mummerLink,
                                  "abun.fasta", "improved3_Double.fasta")
def replaceFiles(folderName, replacedName): commandList = [] commandList.append("cp " + folderName + "improved3.fasta " + folderName + "improved3_backup.fasta") commandList.append("cp " + folderName + "improved3_Double.fasta " + folderName + "improved3_backup.fasta") IORobot.writeToFile_Double1(folderName, replacedName[0:-6] + ".fasta", replacedName[0:-6] + "_Double.fasta", "contig") commandList.append("cp " + folderName + replacedName + " " + folderName + "improved3.fasta") command = "perl -pe 's/>[^\$]*$/\">Segkk\" . $n++ .\"\n\"/ge' " + folderName + "improved3.fasta > " + folderName + "newImproved3.fasta " commandList.append(command) command = "cp " + folderName + "newImproved3.fasta " + folderName + "improved3.fasta " commandList.append(command) commandList.append("cp " + folderName + replacedName[0:-6] + "_Double.fasta " + folderName + "improved3_Double.fasta") for eachcommand in commandList: print eachcommand os.system(eachcommand)
def abunSplit(folderName, mummerLink, myCountDic):
    '''
    Split repeats using abundance information.

    Input : phaseRepeat.txt, myCountDic, improved3_Double.fasta
    Output : abun.fasta

    Algorithm :
        1. Load the repeat in/out contig pairs from phaseRepeat.txt
        2. For each repeat interior:
            a) identify the abundances associated with in/out contigs
            b) perform a split and record the split as graph edges
        3. Condense the graph and read out the contigs
    '''
    # NOTE(review): file handle is never closed explicitly
    json_data = open(folderName + "phaseRepeat.txt", 'r')
    repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)
    N1 = len(myCountDic)*2  # two nodes per contig (forward/reverse)
    G = graphLib.seqGraph(N1)
    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        # match the in/out contigs of this repeat by abundance
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)
    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
def findNoGoByNoHeads(noGoList, side, folderName):
    # Expand a no-go contig list: for every contig in noGoList, walk to its
    # attached reads on the given side, then to the contigs attached to those
    # reads, and collect the "break" contigs found there.
    noGoListNew = []
    sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\
        formSortedDataList(folderName)
    lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta")
    lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    for x in noGoList:
        rList = findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic, lenDicContig, lenDicRead)
        cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig, lenDicRead)
        # bestMatchContigOnly is a module-level flag — TODO confirm its default
        if bestMatchContigOnly == False:
            bestContigIDList = findBreakContigAdv(cList)
        else:
            bestContigIDList = findBreakContig(cList)
        # debug output only when something was actually found
        if len(rList) > 0 and len(cList) > 0:
            print "x, side, len(rList), len(cList), len(bestContigIDList)",\
                abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
            print "cList", bestContigIDList
        noGoListNew = noGoListNew + bestContigIDList
    return noGoListNew
def findNoGoByNoHeads(noGoList, side, folderName): noGoListNew = [] sortedContigList, sortedReadList, sortedContigDic, sortedReadDic =\ formSortedDataList(folderName) lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta") lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") for x in noGoList: rList = findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic, lenDicContig, lenDicRead) cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig, lenDicRead) if bestMatchContigOnly == False: bestContigIDList = findBreakContigAdv(cList) else: bestContigIDList = findBreakContig(cList) if len(rList) > 0 and len(cList) > 0: print "x, side, len(rList), len(cList), len(bestContigIDList)",\ abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList) print "cList", bestContigIDList noGoListNew = noGoListNew + bestContigIDList return noGoListNew
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    '''
    Split repeats using abundance information, filling junctions with
    gap content found via the contig-read graph.

    Input : phaseRepeat.txt, myCountDic, <contigFilename>_Double.fasta
    Output : abun.fasta

    Algorithm :
        1. Load the repeat in/out contig pairs
        2. For each repeat interior:
            a) match in/out contigs by abundance
            b) record the match as graph edges and compute gap content
        3. Condense the graph and read out the contigs
    '''
    json_data = open(folderName + "phaseRepeat.txt", 'r')
    repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)
    N1 = len(myCountDic) * 2  # two nodes per contig (forward/reverse)
    print "N1", N1
    G = graphLib.seqGraph(N1)
    gapContentLookUpList = []
    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
        print "resolvedList", resolvedList
        # collect gap-content entries for each resolved junction
        gapContentLookUpList += generateGapContentLookup(folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename)
        addEdges(G, resolvedList)
    gapContentLookUpDic = {}
    gapContentLookUpList.sort()
    for eachitem in gapContentLookUpList:
        # key "<prev>_<next>" -> [item2, item3, item4]; item4 has a length,
        # presumably the gap sequence — verify against extractGraphToContigs
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])
    # some how change ASplitter here by appending necessary information
    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic)
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename): """ Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta Output : abunsplit.fasta Algorithm : 1. Load data from various sources [various json files] 2. For each repeat interior: a) identify the abundances associated with in/out contigs b) perform a split and record the split 3. Use split results to generate contigs [may already exist in newPhasing.py ] a) use a graph to capture the split results b) use reads to fill in any gaps c) read out the contigs """ json_data = open(folderName + "phaseRepeat.txt", "r") repeatPairs = json.load(json_data) repeatPairs = obtainNonEmpty(repeatPairs) N1 = len(myCountDic) * 2 print "N1", N1 G = graphLib.seqGraph(N1) gapContentLookUpList = [] for eachitem in repeatPairs: inList, outList = eachitem[0], eachitem[1] resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1) print "resolvedList", resolvedList gapContentLookUpList += generateGapContentLookup( folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename ) addEdges(G, resolvedList) gapContentLookUpDic = {} gapContentLookUpList.sort() for eachitem in gapContentLookUpList: gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]] print eachitem[2:4], len(eachitem[4]) # some how change ASplitter here by appending necessary information G.condense() IORobot.extractGraphToContigs( G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic )
def condenseEdgeRemove(self, G_ContigRead, folderName, mummerLink, contigFilename):
    # Remove suspicious 1-in/1-out edges from the condensed graph: for each
    # uniquely-joined node pair, count supporting contig-read paths in
    # G_ContigRead; if support is weak and the pair is flagged no-go (or the
    # direct overlap is suspiciously long), cut the edge.
    print "condenseEdgeRemove"
    thresPass = 100
    thresForStrangeCut = 5000
    ### kkdebug
    toRemoveList = []
    for eachnode in self.graphNodesList:
        if len(eachnode.nodeIndexList) > 0:
            if len(eachnode.listOfNextNodes) == 1:
                nextNodeIndex = eachnode.listOfNextNodes[0][0]
                nextNode = self.graphNodesList[nextNodeIndex]
                if len(nextNode.listOfPrevNodes) == 1:
                    currentName = eachnode.nodeIndex
                    nextName = nextNode.nodeIndex
                    # paths linking the two nodes (K=5 — see findAllPathK)
                    contigReadPaths = findAllPathK(currentName, nextName, G_ContigRead, 5)
                    cName = abunHouseKeeper.parseIDToName(currentName, 'C', 0)
                    nName = abunHouseKeeper.parseIDToName(nextName, 'C', 0)
                    # NOTE(review): re-read from disk for every pair; could be hoisted
                    noGoNext = self.readInJSON(folderName, "noGoNext.json")
                    noGoPrev = self.readInJSON(folderName, "noGoPrev.json")
                    overlap = [-1, -1]
                    ctr = 0
                    for eachpath in contigReadPaths:
                        if len(eachpath) > 2:
                            ctr = ctr + 1
                        elif len(eachpath) == 2:
                            # direct neighbours: measure the contig-contig overlap
                            contigName = cName
                            leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
                            contigName = nName
                            rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
                            overlap = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                    if ctr <= thresPass and (cName in noGoNext or nName in noGoPrev or overlap[0] > thresForStrangeCut):
                        self.removeEdge(currentName, nextName)
                        toRemoveList.append([currentName, nextName])
    ### kkdebug
    #with open( "dataFolder/toRemoveList.json", 'w') as f:
    #    json.dump(toRemoveList, f)
    self.findAdjList()
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    # Load the x-resolved graph, compute gap content for every pair of
    # consecutive nodes inside each condensed node, extract the contigs, and
    # finally strip redundant ones to produce "abun".
    json_data = open(folderName + "mapDummyToRealDic.json", 'r')
    mapDummyToRealDic = json.load(json_data)
    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")
    gapContentLookUpDic = {}
    furtherGapList = []
    # gapContentLookUpDic starts empty here, so every adjacent pair inside a
    # condensed node is treated as a further gap
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):
                bk, fwd = G.graphNodesList[i].nodeIndexList[j], G.graphNodesList[i].nodeIndexList[j + 1]
                key = str(bk) + "_" + str(fwd)
                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])
    with open(folderName + "furtherGapList.json", 'w') as f:
        json.dump(furtherGapList, f)
    furtherGapContentLookUpList = generateGapContentLookup(folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic)
    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])
    #segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")
    print "Final step: really hacking a file"
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")  # NOTE(review): unused
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic)
    if True:
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def colorNodes(folderName, mummerPath,sourceFilename, contigFilename, readsetFilename): print "colorNodes" lenDic = IORobot.obtainLength(folderName, sourceFilename+".fasta") print lenDic thresForShort = 15000 shortList = [] longList = [] for eachitem in lenDic: if lenDic[eachitem] > thresForShort: longList.append(eachitem) else: shortList.append(eachitem) IORobot.putListToFileO(folderName, sourceFilename+".fasta", contigFilename, longList) IORobot.putListToFileO(folderName, sourceFilename+".fasta", readsetFilename, shortList)
def findCoverageFromRawData(folderName): contigLenDic = IORobot.obtainLength(folderName, "contigs.fasta") readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta") G = 0 NL = 0 for eachitem in contigLenDic: G = G+ contigLenDic[eachitem] for eachitem in readLenDic: NL = NL+ readLenDic[eachitem] c = (NL*1.0)/G print c return c
def colorNodes(folderName, mummerPath, sourceFilename, contigFilename, readsetFilename): print "colorNodes" lenDic = IORobot.obtainLength(folderName, sourceFilename + ".fasta") print lenDic thresForShort = 15000 shortList = [] longList = [] for eachitem in lenDic: if lenDic[eachitem] > thresForShort: longList.append(eachitem) else: shortList.append(eachitem) IORobot.putListToFileO(folderName, sourceFilename + ".fasta", contigFilename, longList) IORobot.putListToFileO(folderName, sourceFilename + ".fasta", readsetFilename, shortList)
def findCoverageFromRawData(folderName): contigLenDic = IORobot.obtainLength(folderName, "contigs.fasta") readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta") G = 0 NL = 0 for eachitem in contigLenDic: G = G + contigLenDic[eachitem] for eachitem in readLenDic: NL = NL + readLenDic[eachitem] c = (NL * 1.0) / G print c return c
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    '''
    Algorithm:
        1) Load ContigReadGraph and form xResolvedGraph
        2) Transitive reduction and remove double pointers
        3) Bipartite resolution
        4) xResolve
        5) Form gapLookUp
        6) Read contigs out from graph
        7) CheckAns and get it done today again...
    '''
    # optional EM-based refinement step (gated by a global flag)
    if abunHouseKeeper.abunGlobalRunEM == True:
        emalgo.generateAssociatedReadDic(folderName)
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDic)
    # surgery -> bipartite resolution -> X resolution; log edges after each stage
    Gnew = graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename)
    Gnew.logEdges(folderName, "graphsurgery")
    #Gnew.reportEdge()
    #assert(False)
    Gnew = BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic, mummerLink)
    Gnew.logEdges(folderName, "BResolution")
    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1, mummerLink)
    Gnew.logEdges(folderName, "XResolution")
    readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph)
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList), len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList) , len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def generateGapContentLookup(folderName, mummerLink, oldResolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic={}):
    # Expand any dummy-node endpoints (ID >= N1) in the resolved pairs into
    # their underlying real node chains, then look up the gap content for
    # every adjacent pair of that expansion.
    # NOTE(review): the mutable default for mapDummyToRealDic is only read,
    # never mutated, so the shared-default pitfall does not bite here.
    gapContentLookUpList = []
    contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta")
    N1 = len(contigLenDic) * 2  # node IDs >= N1 refer to dummy nodes
    resolvedList = []
    print "mapDummyToRealDic", mapDummyToRealDic
    for eachmatchpair in oldResolvedList:
        tmpList = []
        if eachmatchpair[0] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] - N1)][1]
        else:
            tmpList.append(eachmatchpair[0])
        if eachmatchpair[-1] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] - N1)][1]
        else:
            tmpList.append(eachmatchpair[-1])
        # every consecutive pair of the expanded chain becomes a lookup item
        for ii in range(len(tmpList) - 1):
            resolvedList.append([tmpList[ii], tmpList[ii + 1]])
    gapContentLookUpList = parallelGapLookUp(resolvedList, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename)
    return gapContentLookUpList
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):
    # Load the x-resolved graph, compute gap content for consecutive nodes
    # inside each condensed node, extract contigs, and remove redundant ones.
    json_data = open(folderName + "mapDummyToRealDic.json", "r")
    mapDummyToRealDic = json.load(json_data)
    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")
    gapContentLookUpDic = {}
    furtherGapList = []
    # gapContentLookUpDic is empty at this point, so every adjacent pair
    # inside a condensed node qualifies as a further gap
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):
                bk, fwd = G.graphNodesList[i].nodeIndexList[j], G.graphNodesList[i].nodeIndexList[j + 1]
                key = str(bk) + "_" + str(fwd)
                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])
    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)
    furtherGapContentLookUpList = generateGapContentLookup(folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic)
    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])
    # segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")
    print "Final step: really hacking a file"
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")  # NOTE(review): unused
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic)
    if True:
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def replaceFiles( folderName, replacedName) : commandList = [] commandList.append("cp " + folderName + "improved3.fasta " + folderName + "improved3_backup.fasta") commandList.append("cp " + folderName + "improved3_Double.fasta " + folderName + "improved3_backup.fasta") IORobot.writeToFile_Double1(folderName, replacedName[0:-6]+".fasta", replacedName[0:-6]+"_Double.fasta", "contig") commandList.append("cp " + folderName + replacedName + " "+folderName + "improved3.fasta") command = "perl -pe 's/>[^\$]*$/\">Segkk\" . $n++ .\"\n\"/ge' "+folderName+"improved3.fasta > "+folderName+"newImproved3.fasta " commandList.append(command) command = "cp " +folderName+"newImproved3.fasta "+folderName+"improved3.fasta " commandList.append(command) commandList.append("cp " + folderName + replacedName[0:-6]+"_Double.fasta " + folderName + "improved3_Double.fasta") for eachcommand in commandList: print eachcommand os.system(eachcommand)
def formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile):
    '''
    Materialise node paths into sequences.

    Input : directPathList, indirectPathList, contigFile, readFile
    Output: directPath.fasta, indirectPath.fasta
    '''
    # load the sequences the path node IDs refer to
    contigSeqs = IORobot.readContigsFromFile(folderName, contigFile)
    readSeqs = IORobot.readContigsFromFile(folderName, readFile)

    # transform both path lists first, then write both outputs
    directSeqs = IORobot.pathListToSeqListTransform(directPathList, contigSeqs,
                                                    readSeqs, mummerPath, folderName)
    indirectSeqs = IORobot.pathListToSeqListTransform(indirectPathList, contigSeqs,
                                                      readSeqs, mummerPath, folderName)

    IORobot.writeSegOut(directSeqs, folderName, "directPath.fasta")
    IORobot.writeSegOut(indirectSeqs, folderName, "indirectPath.fasta")
def formExtraEdges(folderName="/home/kakitfive/kkdata2/MetaFinisherSC/dataFolderBackup/", optTypeFileHeader="phaseString", contigFilename="improved3", G=[], N1=0):
    # Add edges for contigs fully embedded inside a single read: each such
    # contig inherits the read's neighbouring contig edges in G.
    # NOTE(review): folderName defaults to a developer-specific path and the
    # G=[] default is a placeholder; callers are expected to pass a seqGraph.
    dataList = alignerRobot.extractMumData(folderName, optTypeFileHeader + "CR" + "Out")
    dataList.sort(key=itemgetter(-2))
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    count = 0
    tmpItem = []
    embedContig2ReadDic, read2EmbedContigDic = {}, {}
    # group alignment records by the second-to-last field (contig name)
    for key, items in groupby(dataList, itemgetter(-2)):
        isEmbedded = False
        for eachitem in items:
            #print eachitem
            # record covers nearly the whole contig (field 4 vs contig length)
            # => treat the contig as embedded in that read
            if eachitem[4] > lenDic[key] - 300:
                isEmbedded = True
                tmpItem = eachitem
        if isEmbedded:
            count = count + 1
            readName = tmpItem[-1]
            embedContig2ReadDic[key] = readName
            read2EmbedContigDic[readName] = key
    print "len(embedContig2ReadDic)", len(embedContig2ReadDic)
    #assert(False)
    for contigName in embedContig2ReadDic:
        readName = embedContig2ReadDic[contigName]
        readIndex, contigIndex = abunHouseKeeper.parseEdgeNameToID(readName, 'R'), abunHouseKeeper.parseEdgeNameToID(contigName, 'C')
        # copy the read's contig neighbours (IDs < N1) onto the embedded contig
        for eachprev in G.graphNodesList[readIndex].listOfPrevNodes:
            idNode, wt = eachprev[0], eachprev[1]
            if idNode < N1:
                G.insertEdge(idNode, contigIndex, wt)
        for eachnext in G.graphNodesList[readIndex].listOfNextNodes:
            idNode, wt = eachnext[0], eachnext[1]
            if idNode < N1:
                G.insertEdge(contigIndex, idNode, wt)
    return G
def decideCut(folderName, mummerPath):
    '''
    Input : directPath.fasta, indirectPath.fasta
    Output : toDelete
    '''
    thres = 50
    # align indirect paths against direct paths (guard kept from original;
    # can be disabled to reuse a previously computed alignment)
    if True:
        alignerRobot.useMummerAlign(mummerPath, folderName, \
            "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True)
    dataList = alignerRobot.extractMumData(folderName , "indirectvsdirectOut")
    lenDic = IORobot.obtainLength(folderName, "directPath.fasta")
    ctr = 0
    ctrindirect = 0
    dataList.sort(key = itemgetter(-1))
    toDelete = True
    # count direct paths (grouped by last field) that are essentially fully
    # covered by some indirect-path alignment
    for key, items in groupby(dataList, itemgetter(-1)):
        print "key", key
        ctr = ctr + 1
        isFound = False
        for eachitem in items:
            if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres:
                isFound = True
        if isFound:
            ctrindirect = ctrindirect + 1
    epsilon = 1.1
    print "ctrindirect, ctr", ctrindirect, ctr
    # NOTE(review): with epsilon = 1.1, (1 - epsilon) is negative, so the
    # comparison below can never be True — toDelete is always True here.
    # Also raises ZeroDivisionError when dataList is empty (ctr == 0).
    # Confirm whether epsilon was meant to be e.g. 0.1.
    if ctrindirect*1.0/ctr < (1- epsilon):
        toDelete = False
    else:
        toDelete = True
    return toDelete
def runningTestSet(self ,myFolderName, ctexpected, commandList, matchingContigFile): print "Integration test on RepeatPhaserMain: " + myFolderName self.sourceFolder = myFolderName os.system("rm -rf "+ self.testingFolder) os.system("mkdir " + self.testingFolder) for eachitem in self.listOfFiles: os.system("cp "+ self.sourceFolder + eachitem + " " +self.testingFolder) for eachcommand in commandList: os.system(eachcommand) lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile) assert(len(lenDic) == ctexpected) os.system("rm -rf "+ self.testingFolder)
def mapStrangePairs():
    # Debug utility (hard-coded to Apr10Test/): dump the sequence on each side
    # of every recorded "further gap" into wrongCondense.fasta, then map the
    # dump against reference.fasta to inspect wrongly condensed pairs.
    folderName = "Apr10Test/"
    json_data = open(folderName + "furtherGapList.json", 'r')
    furtherGapList = json.load(json_data)
    segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")
    f = open(folderName + "wrongCondense.fasta", 'w')
    ctr = 0
    for eachitem in furtherGapList:
        beforeI, afterI = eachitem[0], eachitem[1]
        f.write(">Segkk"+str(ctr)+"\n")
        f.write(segLookUp[beforeI]+"\n")
        ctr = ctr + 1
        f.write(">Segkk"+str(ctr)+"\n")
        f.write(segLookUp[afterI]+"\n")
        ctr = ctr + 1
    f.close()
    # NOTE(review): alignment disabled; the extraction below then expects
    # wrongCondenseDebugOut to exist from a previous run — verify the
    # intended extent of this `if False:` block in the original source.
    if False:
        alignerRobot.useMummerAlign("/usr/bin/", folderName, "wrongCondenseDebug", "reference.fasta", "wrongCondense.fasta")
    dataList = alignerRobot.extractMumData(folderName, "wrongCondenseDebugOut")
    dataList.sort(key = itemgetter(-1))
    mappedDic = {}
    # keep, per query, the alignment record with the longest match (field -4)
    for key, items in groupby(dataList, itemgetter(-1)):
        print "key", key
        matchLen = -1
        for eachitem in items:
            if eachitem[-4] > matchLen:
                mappedDic[key] = eachitem
                matchLen = eachitem[-4]
    for eachitem in mappedDic:
        print "results : ", eachitem, mappedDic[eachitem]
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i, adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem": newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) # cut here adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
def runningTestSet(self, myFolderName, ctexpected, commandList, matchingContigFile): print "Integration test on RepeatPhaserMain: " + myFolderName self.sourceFolder = myFolderName os.system("rm -rf " + self.testingFolder) os.system("mkdir " + self.testingFolder) for eachitem in self.listOfFiles: os.system("cp " + self.sourceFolder + eachitem + " " + self.testingFolder) for eachcommand in commandList: os.system(eachcommand) lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile) assert (len(lenDic) == ctexpected) os.system("rm -rf " + self.testingFolder)
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) # cut here adjListToRepeatList(newAdjacencyList,folderName,repeatFilename )
def checkPathLength(path, G, N1, folderName): lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta") sumLength = 0 overlapLength = 0 for index, i in zip(path, range(len(path))): header = "Read" + str((index - N1) / 2) + "_" if (index - N1) % 2 == 0: header = header + "p" else: header = header + "d" print "lenDicRR[header], ", lenDicRR[header], header print (index - N1) * 2 + 1, (index - N1) * 2 + 2 sumLength = sumLength + lenDicRR[header] if i != len(path) - 1: for eachnext in G.graphNodesList[index].listOfNextNodes: if eachnext[0] == path[i + 1]: overlapLength = overlapLength + eachnext[1] break print sumLength, overlapLength, sumLength - overlapLength
def checkPathLength(path, G, N1, folderName):
    # Debug report: sum read lengths along `path`, subtract the pairwise
    # overlaps recorded in G, and print the resulting spanned length.
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    sumLength = 0
    overlapLength = 0
    for index, i in zip(path, range(len(path))):
        # read node id -> "Read<k>_p" (even offset) or "Read<k>_d" (odd offset)
        header = "Read" + str((index - N1) / 2) + "_"
        if (index - N1) % 2 == 0:
            header = header + "p"
        else:
            header = header + "d"
        print "lenDicRR[header], ", lenDicRR[header], header
        print (index - N1) * 2 + 1, (index - N1) * 2 + 2
        sumLength = sumLength + lenDicRR[header]
        if i != len(path) - 1:
            # find the overlap weight on the edge to the next path node
            for eachnext in G.graphNodesList[index].listOfNextNodes:
                if eachnext[0] == path[i + 1]:
                    overlapLength = overlapLength + eachnext[1]
                    break
    print sumLength, overlapLength, sumLength - overlapLength
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Algorithm:
        1) Load ContigReadGraph and form xResolvedGraph
        2) Transitive reduction and remove double pointers
        3) Bipartite resolution
        4) xResolve
        5) Form gapLookUp
        6) Read contigs out from graph
        7) CheckAns and get it done today again...
    """
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDic)
    # surgery -> bipartite resolution -> X resolution, then read contigs out
    Gnew = graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename)
    Gnew = BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic)
    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1)
    readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph)
def generateGapContentLookup(folderName, mummerLink, oldResolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic={}):
    # Expand dummy-node endpoints (ID >= N1) of each resolved pair into their
    # underlying real-node chains, then look up gap content for every
    # adjacent pair of the expansion.
    # NOTE(review): mapDummyToRealDic's mutable default is only read, never
    # mutated, so the shared-default pitfall does not bite here.
    gapContentLookUpList = []
    contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta")
    N1 = len(contigLenDic) * 2  # node IDs >= N1 refer to dummy nodes
    resolvedList = []
    print "mapDummyToRealDic", mapDummyToRealDic
    for eachmatchpair in oldResolvedList:
        tmpList = []
        if eachmatchpair[0] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] - N1)][1]
        else:
            tmpList.append(eachmatchpair[0])
        if eachmatchpair[-1] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] - N1)][1]
        else:
            tmpList.append(eachmatchpair[-1])
        # each consecutive pair of the expanded chain becomes a lookup item
        for ii in range(len(tmpList) - 1):
            resolvedList.append([tmpList[ii], tmpList[ii + 1]])
    gapContentLookUpList = abunGraphLib.parallelGapLookUp(resolvedList, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename)
    return gapContentLookUpList
def viewLenDic(): folderName = "Apr10Test/" json_data = open(folderName + "myCountDic.json", 'r') myCountDic = json.load(json_data) contigLenDic = IORobot.obtainLength(folderName, "LC_n.fasta") toPlotListX = [] toPlotListY = [] for eachitem in contigLenDic: toPlotListX.append(myCountDic[eachitem]) toPlotListY.append(contigLenDic[eachitem]) print toPlotListX, toPlotListY with open(folderName + "toPlotListX.json", 'w') as f: json.dump(toPlotListX, f) with open(folderName + "toPlotListY.json", 'w') as f: json.dump(toPlotListY, f)
def test1(): lenDic = {} coverageDic = {} lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta") f = open("/Users/kakitlam/Documents/abundata", 'r') tmp = f.readline() while len(tmp) > 0: if len(tmp) > 10: myitem = tmp[0:-1].split() coverageDic[myitem[0]] = float(myitem[1]) tmp = f.readline() f.close() myList = [] baseCt = {} for eachitem in lenDic: myList.append(lenDic[eachitem]*coverageDic[eachitem]) baseCt[eachitem] = lenDic[eachitem]*coverageDic[eachitem] for eachitem in lenDic : print eachitem, baseCt[eachitem] for eachitem in lenDic : print eachitem, lenDic[eachitem] for eachitem in lenDic : print eachitem, coverageDic[eachitem]
def outputResults(folderName, mummerLink, toPhaseList, N1, G):
    '''
    Write phased repeat resolutions out as contigs.

    Algorithm :
        a) Write as contigs
        b) Add back reverse complement
        c) Create G2 as the readOut part
        d) Output the contigs by a function call

    folderName  : working directory
    mummerLink  : path to MUMmer binaries (passed through to readContigOut)
    toPhaseList : phased repeat records; each ends with
                  [..., repeat, flanking, result]
    N1          : number of doubled contig nodes
    G           : the original contig/read graph (source of edge weights)
    '''
    # a) Concatenate the doubled contigs and the phasing seed reads into one
    # fasta, renumbering seed headers so read k becomes Contig(k + N1/2).
    combinedName = "contigAndRead_Double.fasta"
    os.system("cp " + folderName + "improved3_Double.fasta " + folderName + combinedName)
    fout = open(folderName + combinedName, 'a')
    fin = open(folderName + "phasingSeedName_Double.fasta", 'r')
    tmp = fin.readline().rstrip()
    while len(tmp) > 0:
        if tmp[0] != ">":
            fout.write(tmp + "\n")
        else:
            # Header looks like ">ReadX_Y"; tmp[5:] strips the ">Read" prefix.
            infoArr = tmp[5:].split("_")
            fout.write(">Contig" + str(int(infoArr[0]) + N1 / 2))
            fout.write("_" + infoArr[1] + "\n")
        tmp = fin.readline().rstrip()
    fin.close()
    fout.close()
    # b) For every phased item also generate its reverse-complement twin.
    # Example record shapes (from the original author):
    '''
    [28], [[2, 690, 28], [6, 126, 28], [28, 212, 0], [28, 216, 4]], 1
    [2 , 690, 28, 212, 0]
    '''
    completePhaseList = []
    for eachitem in toPhaseList:
        repeat = eachitem[-3]
        flanking = eachitem[-2]
        result = eachitem[-1]
        # Reverse the path and flip each node's parity: even index i maps to
        # i+1 and odd index i maps to i-1 via i + (-1)**i.
        revrepeat = []
        for eachsub in eachitem[-3][-1::-1]:
            revrepeat.append(eachsub + pow(-1, eachsub))
        revflanking = [[] for i in range(4)]
        for j in range(2):
            # In/out flanks swap roles on the reverse strand.
            for eachsub in eachitem[-2][j + 2][-1::-1]:
                revflanking[j].append(eachsub + pow(-1, eachsub))
            for eachsub in eachitem[-2][j][-1::-1]:
                revflanking[j + 2].append(eachsub + pow(-1, eachsub))
        revresult = eachitem[-1]
        completePhaseList.append([repeat, flanking, result])
        completePhaseList.append([revrepeat, revflanking, revresult])
    print "completePhaseList", completePhaseList
    # c) Build G2: thread each phased path through fresh intermediate nodes
    # so the two phases do not share interior vertices; nameDic maps every
    # G2 node index back to the original node in G.
    G2 = graphLib.seqGraph(N1)
    nameDic = {}
    for i in range(N1):
        nameDic[i] = i
    for eachitem in completePhaseList:
        repeat, flanking, result = eachitem[0], eachitem[1], eachitem[2]
        path = [[], []]
        # result selects which in-flank pairs with which out-flank.
        if result == 0:
            path[0] = flanking[0][0:-1] + repeat + flanking[2][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[3][1:]
        else:
            path[0] = flanking[0][0:-1] + repeat + flanking[3][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[2][1:]
        print path[0], path[1]
        for i in range(2):
            eachpath = path[i]
            currentNode = G2.graphNodesList[eachpath[0]]
            for nextNodeIndex, ctr in zip(eachpath[1:], range(len(eachpath[1:]))):
                if ctr != len(eachpath[1:]) - 1:
                    # Interior step: clone a fresh node so paths stay disjoint.
                    myindex = len(G2.graphNodesList)
                    nameDic[myindex] = nextNodeIndex
                    newNode = graphLib.seqGraphNode(myindex)
                    G2.graphNodesList.append(newNode)
                else:
                    # Last step: reattach to the real terminal node.
                    newNode = G2.graphNodesList[nextNodeIndex]
                # Carry over the edge weight from the original graph G.
                wt = 0
                for eachck in G.graphNodesList[nameDic[currentNode.nodeIndex]].listOfNextNodes:
                    if eachck[0] == nextNodeIndex:
                        wt = eachck[1]
                        break
                newNode.listOfPrevNodes.append([currentNode.nodeIndex, wt])
                currentNode.listOfNextNodes.append([newNode.nodeIndex, wt])
                currentNode = newNode
    # d) Condense unambiguous chains and emit the final contigs.
    graphFileName = "phaseGraphFinal"
    G2.condense()
    G2.saveToFile(folderName, graphFileName)
    IORobot.readContigOut(folderName, mummerLink, graphFileName, combinedName, "improved4.fasta", "outOpenListphaing", nameDic)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName, needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs V
        b) Mummer the data and extract dataList three times V
        c) Use the subroutine to output a graph V
        d) Output the graph to a file phasing_String_graph.graph V

    needAlignment : when False, skips running MUMmer and reuses the
                    existing *Out alignment files on disk.
    '''
    G = []
    # a) Write doubled fastas (forward + reverse complement per sequence).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # b1) Contig-vs-contig (CC) alignment, batched for parallelism.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    # (older single-shot variant kept for reference)
    # if needAlignment:
    #     alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]], houseKeeper.globalParallel)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    # b2) Read-vs-read (RR) alignment; can be disabled globally.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Drop self/identical matches between reads.
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []
    # b3) Contig-vs-read (CR) alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # c) Nodes 0..N1-1 are contigs, N1.. are reads; add the three edge sets.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # d) Augment with extra edges, then persist the graph.
    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)
    Gnew.saveToFile(folderName, graphName)
    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename):
    # Resolve the gap between two contigs of one resolved pair: find the
    # shortest contig-free read path between them in the contig-read graph,
    # then stitch read sequences (minus pairwise overlaps) into the gap.
    # Returns [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
    # where leftEnd is the cut position on the left contig and middleContent
    # is the filler sequence (rightStart stays 0 throughout).
    print eachmatchpair
    leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, ""
    succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)
    # NOTE(review): the findPathBtwEnds result above is immediately discarded
    # by the reassignment below; the path is recomputed via findAllPathK.
    # Looks like a leftover from an earlier implementation — confirm before
    # removing the call (it may have side effects).
    succReadsList = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # Enumerate candidate paths up to depth 5 between the two contig nodes.
    allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5)
    # shuffle(allPaths)
    print "allPaths", allPaths
    # Keep only paths whose interior nodes are all reads (index >= N1).
    possibleList = []
    for p in allPaths:
        noContig = True
        for pp in p[1:-1]:
            if pp < N1:
                noContig = False
        if noContig == True:
            possibleList.append(p)
    print "possibleList", possibleList
    # Pick the shortest contig-free path (1000 acts as +infinity here).
    minListLen = 1000
    for p in possibleList:
        if len(p) < minListLen:
            succReadsList = p
            minListLen = len(p)
    if len(succReadsList) > 0:
        # Strip the contig endpoints, leaving only the interior reads.
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"
    print "succReadsList", succReadsList
    if len(succReadsList) == 0:
        # No connecting reads: overlap the two contigs directly.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap contig : ", overlap
        # overlap[0] is the overlap length on the left sequence (presumably —
        # determined by IORobot.alignWithName; verify against that helper).
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
    else:
        # Left contig -> first read overlap fixes the cut point on the contig.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        readName = abunHouseKeeper.parseIDToName(succReadsList[0], "R", N1)
        print readName
        rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap start read : ", overlap
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
        # Stitch consecutive reads, trimming each read by its overlap with
        # the next one.
        for i in range(len(succReadsList) - 1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], "R", N1)
            leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], "R", N1)
            rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            overlap = IORobot.alignWithName(
                leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]
        # Last read -> right contig overlap closes the gap.
        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], "R", N1)
        leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap end read : ", overlap
        middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]
    return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs V
        b) Mummer the data and extract dataList three times V
        c) Use the subroutine to output a graph V
        d) Output the graph to a file phasing_String_graph.graph V

    NOTE(review): this is one of several near-identical copies of
    formReadContigStringGraph in this file; this variant always runs the
    aligner (guards are `if True:`) and contains leftover debug prints for
    Read164_p/Read159_p.  Consider consolidating with the needAlignment
    variant.
    '''
    G = []
    # a) Write doubled fastas (forward + reverse complement per sequence).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # b1) Contig-vs-contig (CC).
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    # b2) Read-vs-read (RR); can be disabled globally.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Leftover targeted debug output for one specific read pair.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []
    # b3) Contig-vs-read (CR).
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        # alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # c) Nodes 0..N1-1 are contigs, N1.. are reads; add the three edge sets.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # d) Persist the graph and report sanity statistics.
    G.saveToFile(folderName, graphName)
    checkGraphLength(G, N1, lenDicRR)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def performPhasing(folderName, mummerLink):
    # Drive the repeat-phasing pipeline: for each repeat specification,
    # reformat the noisy reads, vote on the phasing via cleaner/extender,
    # and hand the successful phasings to outputResults (-> improved4.fasta).
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py :
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)
    2. Format of input data data :
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    3. IO :
        a) Input : repeatSpecification.txt, phasingSeedName_Double.fasta, graph G
        b) Output : improved4.fasta
    3. Algorithm:
        a) reformatNoisyReads
        b) reformatToProcessList
        c) formShortToLongMapping
    '''
    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    lenDicCC = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    # Drop reverse-complement duplicates of the repeat records.
    loadData = filterReverseComp(loadData, N1)
    toPhaseList = []
    if True:
        for eachitem in loadData:
            # print eachitem
            flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3]
            # Re-index reads/contigs into a local namespace for this repeat.
            noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
            toProcessList = reformatToProcessList(folderName, flankingList, repeatList, dicFromOriginal, N1)
            shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
            indelRobot = createIndelRobot(folderName)
            # Two-pass cleaning: "init" to set up, "vote" to split the reads
            # into the two phases.
            cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "init")
            in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning(
                [noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
            extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
            # -1 marks a failed extension; only successes are phased.
            if extendResult != -1:
                print "extendResult: ", extendResult
                toPhaseList.append(eachitem + [extendResult])
    # Round-trip through JSON (also leaves a checkpoint file on disk).
    with open(folderName + 'toPhaseList.txt', 'w') as outfile:
        json.dump(toPhaseList, outfile)
    json_data = open(folderName + 'toPhaseList.txt', 'r')
    toPhaseList = json.load(json_data)
    outputResults(folderName, mummerLink, toPhaseList, N1, G)
def resolvingTandem(folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # Resolve tandem repeats: detect a repeat loop per profile, estimate its
    # copy count from read coverage, and join the flanking contigs with the
    # expanded repeat into tademResolved.fasta.
    print "resolvingTandem"
    """
    Input : repeat info
    Output : count, join.

    Algorithm:
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    thres = 5  # max distance-to-end for an alignment to count as an overlap
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)
    maxDuplicate = 10  # number of repeat copies written back-to-back
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    # Read-read overlap lengths, keyed "nameA;nameB".
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    # Contig-read overlap lengths, same key scheme.
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    print dataListCRDic
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    happyTandemList = {}
    for eachrepProfile in loadData:
        # 1) DFS from the repeat's start contig to find a cycle.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
        # 2) Extract the tandem cycle: the suffix of the DFS path starting at
        # the first occurrence of the repeated terminal node.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1
            print returnPathList
            print tandemPath
        # NOTE(review): if isTerminate is False, tandemPath is never bound and
        # the loop below raises NameError — confirm whether profiles are
        # guaranteed to terminate.
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # Stitch one repeat unit from the cycle's reads, trimming overlaps.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            # Node index -> "ReadK_p"/"ReadK_d" (even = p, odd = d).
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]
        print "len(repeatContent)", len(repeatContent)
        # Write maxDuplicate copies back-to-back as the alignment template.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()
        # 4) Align the profile's associated reads onto the template.
        repeatReadList = eachrepProfile[1]
        myList = []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem - N1) / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)
        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")
        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")
        # 5) Sum, per read, its best match length against the template.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        # print "dataList[0]", dataList[0]
        # groupby requires sorting on the same key (the read name).
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        print c, lrepeat, totalBasesMatch
        # Estimated copy count = matched bases / (coverage * unit length).
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
        # 6)
        # a) find the starting point of the repeat on the flanking contig
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1
        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"
        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"
        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta")
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")
        dataList.sort(key=itemgetter(0))
        # Pick the longest match to locate where the repeat begins.
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        # Keep ct copies' worth of repeat sequence for this contig, and trim
        # the combined contig at the repeat boundary.
        happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
    # 7) Combine all the repeat information and do the join
    # Union-find-like leader bookkeeping: merged contigs point at a leader.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)
    # Emit each merged group once, marking all its members as written.
    checkingList = [False for i in range(N1)]
    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")
        if checkingList[id / 2] == False:
            fout.write(">Segkk" + str(counter) + "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True
    fout.close()
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i, adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem": newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) G2 = abunGraphLib.seqGraphWt(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0: leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([ abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList) ]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert (loadData == repeatList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm:
        a) Form double reads and contigs V
        b) Mummer the data and extract dataList three times V
        c) Use the subroutine to output a graph V
        d) Output the graph to a file phasing_String_graph.graph V

    NOTE(review): near-identical to the other formReadContigStringGraph
    definitions in this file (this copy, like the previous one, always runs
    the aligner and keeps the Read164_p/Read159_p debug prints); in Python
    the last definition wins, so the duplicates should be consolidated.
    '''
    G = []
    # a) Write doubled fastas (forward + reverse complement per sequence).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # b1) Contig-vs-contig (CC).
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    # b2) Read-vs-read (RR); can be disabled globally.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Leftover targeted debug output for one specific read pair.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)
    else:
        dataListRR = []
    # b3) Contig-vs-read (CR).
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header)
        # alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    # c) Nodes 0..N1-1 are contigs, N1.. are reads; add the three edge sets.
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    '''
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # d) Persist the graph and report sanity statistics.
    G.saveToFile(folderName, graphName)
    checkGraphLength(G, N1, lenDicRR)
    print "len(G.graphNodesList)", len(G.graphNodesList)
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    # Abundance-based repeat splitting with X-node resolution.
    # NOTE(review): every pipeline stage below is wrapped in `if False:` and
    # therefore disabled — as written, only the debug reachability report
    # runs.  These look like manually-toggled checkpoints for resuming the
    # pipeline mid-way; confirm the intended toggles before relying on this.
    N1 = len(myCountDic) * 2
    print "N1", N1
    # Debug: report the contig-to-contig reachability edges.
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    adj = [[] for i in range(N1)]
    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)
    Gnew = graphLib.seqGraph(N1)
    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)
    Gnew.reportEdge()
    # End Debug
    # Stage 1 (disabled): resolve repeats by abundance matching and X-nodes,
    # persisting resolvedList.json and mapDummyToRealDic.json.
    if False:
        json_data = open(folderName + "phaseRepeat.txt", "r")
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)
        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
            biResolvedCombineList += resolvedList
        # Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)
        # Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList), len(xResolvedList), len(biResolvedCombineList)
        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)
        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)
    # Stage 2 (disabled): compute gap-filling content for each resolved pair
    # and persist gapContentLookUpDic.json.
    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)
        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic)
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()
        for eachitem in gapContentLookUpList:
            # Key is "leftIndex_rightIndex"; value is [leftEnd, rightStart, filler].
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])
        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)
    # Stage 3 (disabled): build and condense the resolved graph
    # (real nodes plus dummies) and save it as xResolvedGraph.
    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)
        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()
        G.saveToFile(folderName, "xResolvedGraph")
    # Stage 4 (disabled): append dummy segments to a copy of the doubled
    # contigs and emit the final abun.fasta from the resolved graph.
    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")
        json_data = open(folderName + "gapContentLookUpDic.json", "r")
        gapContentLookUpDic = json.load(json_data)
        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")
        f = open(folderName + "tmpWithDummy.fasta", "a")
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()
        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic)
def resolvingTandem(folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    print "resolvingTandem"
    '''
    Input : repeat info
    Output : count, join.

    Algorithm:
        1. Find loops
        2. Form repeat
        3. Form chain of repeat copies back to back
        4. Align reads
        5. Calculate extra bases beyond flanking region
        6. Calculate count
        7. Join the contigs
    '''
    # 0 ) Load all the data
    thres = 5              # max allowed alignment-gap field (record[2]) for a usable overlap
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)     # node ids >= N1 in the graph are reads, < N1 are contigs
    maxDuplicate = 10      # how many repeat copies to concatenate into the template
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    # Read-to-read overlaps, keyed "leftName;rightName" -> overlap length.
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    # Contig-to-read overlaps, same key scheme.
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]
    print dataListCRDic
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    happyTandemList = {}   # contigName -> repeat sequence to splice in after it
    for eachrepProfile in loadData:
        # 1) DFS from the contig entering the repeat, looking for a loop.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
        # 2) If the walk revisited a node, the tail of the path from the first
        # occurrence of the final node is one full tandem cycle.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # Stitch one repeat copy from the overlapping reads along the cycle.
        # Node id -> read name: subtract N1, halve for the read number, parity
        # picks the "p" (forward) or "d" (reverse) strand copy.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            # Append the read minus the part that overlaps the next read.
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]
        print "len(repeatContent)", len(repeatContent)
        # Write maxDuplicate back-to-back copies as the alignment template.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()
        # 4) Align the reads associated with this repeat onto the template.
        repeatReadList = eachrepProfile[1]
        myList = []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem - N1) / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)
        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")
        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")
        # 5) Sum, per read, its single best match length; divide by
        # (coverage * repeat length) to estimate the copy count ct.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        # print "dataList[0]", dataList[0]
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
        # 6)
        # a) find the starting point of the repeat within the flanking contig
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1
        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"
        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"
        overlapFirst = dataListCRDic[contigName + ";" + readName]
        # Contig joined to its first repeat read, aligned back to the template
        # to locate where the repeat begins (repeatStart) and where the contig
        # should be cut (contigEnd).
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta")
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")
        dataList.sort(key=itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName] = repeatContentLarge[repeatStart:int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
    # 7) Combine all the repeat information and do the join.
    # leaderList is a union-find-style map: contig index -> index of the
    # contig that now owns its sequence after joins.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
    # Group members under each leader so a joined group is emitted only once.
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)
    checkingList = [False for i in range(N1)]
    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        # shadows builtin "id"
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id / 2] == False:
            fout.write(">Segkk" + str(counter) + "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True
    fout.close()
def defineRepeatAndFlanking(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, repeatSpec):
    '''
    Input :
        V a) String graph : G
        V b) Repeat Pairing : repeatList
    Output :
        V a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E])
        V b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
        V c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])
    Algorithm :
        V 1. Find repeat by graph operations
        V 2. Find flanking region by graph operations
        V 3. Find associated reads by graph operations
    '''
    print "defineRepeatAndFlanking: "
    # 0. Load previous data
    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraph(G)
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
    bigDumpList = []
    print "len(repeatList)", len(repeatList), repeatList
    for r in repeatList:
        # Bipartite node ids are 2*contig(+1); halve to recover contig indices.
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        # Only 2-in/2-out and 1-in/1-out repeats are handled; the 1/1 case is
        # duplicated so the downstream code can always assume two of each.
        if (len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            if (len(rIn) == 1 and len(rOut) == 1):
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]
            # 1. Records reachable indices
            kkIn, kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk) + "_" + "in")
            for eachkk in rOut:
                kkOut.append(str(eachkk) + "_" + "out")
            abunGraphLib.markReachableIndices(G, Grev, kkIn, kkOut, N1)
            # 2. Marks inside nodes
            singleMissList, allPassList = abunGraphLib.markInsideNodes(G, kkIn, kkOut)
            for i in range(4):
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)
            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = abunGraphLib.markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = abunGraphLib.markInterior(G, myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            # checkPathLength(repeatPathway, G, N1, folderName)
            # 5. Find flanking region by shortest path search again
            flankingPathsList = abunGraphLib.markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            # 6. Find associated reads by graph node query
            # NOTE(review): this rebinds "repeatList", the very list the outer
            # loop iterates over.  Iteration still works because the for-loop
            # holds a reference to the original object, but the name is
            # misleading -- confirm before refactoring.
            flankingList, repeatList = abunGraphLib.markAssociatedReads(G, singleMissList, allPassList)
            # ## Experimental
            repeatList = allPassList
            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    # 7. Format return and move on to the phasing
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter): myCountDic = {} for eachitem in lenDic: myCountDic[eachitem] = 0 dataList.sort(key=itemgetter(-1)) ctkk, ctbase = 0, 0 toAddBackDic = copy.deepcopy(readLenDic) for key, items in groupby(dataList, itemgetter(-1)): maxMatch = -1 bestname = "" for eachitem in items: ct = eachitem[6] / 100.0 * eachitem[4] if ct > maxMatch: maxMatch = ct bestname = eachitem[-2] myCountDic[bestname] += readLenDic[key] ctkk = ctkk + 1 ctbase = ctbase + readLenDic[key] toAddBackDic[key] = -1 cttot = 0 for eachitem in readLenDic: cttot = cttot + readLenDic[eachitem] print "Missed coverage ", (cttot - ctbase) / (4.7 * pow(10, 6)) print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic)) toAddReadList = [] for eachitem in toAddBackDic: if toAddBackDic[eachitem] >= 0: toAddReadList.append(eachitem) """ This part need the most parallelism because it is most intense with -l 10 split V, workerList V , combine """ if continueFilter: numberOfFiles = 20 IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList) bindir = os.path.abspath(os.path.dirname(sys.argv[0])) command = ( bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta" ) os.system(command) workerList = [] for dummyI in range(1, numberOfFiles + 1): indexOfMum = "" if dummyI < 10: indexOfMum = "0" + str(dummyI) else: indexOfMum = str(dummyI) outputName, referenceName, queryName, specialName = ( "outAbunRefine" + indexOfMum, "improved3.fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum, ) workerList.append([outputName, referenceName, queryName, specialName]) alignerRobot.useMummerAlignBatch( mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True ) alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles) 
for i in range(len(myCountDic)): eachitem = "Segkk" + str(i) print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem]) myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem]) return myCountDic
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun ): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename) G2 = abunGraphLib.seqGraphWt(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0 : leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert(loadData == repeatList)
def continuousIntegration():
    '''
    Scratchpad of one-off debug experiments against local test folders.
    All branches except the final one are disabled with "if False"; each
    branch was toggled on by hand when needed.  As written, only the last
    removeRedundantWithFile call executes.
    '''
    if False:
        # Tiny synthetic graph to exercise BFS_revisit.
        G = graphLib.seqGraph(10)
        for i in range(5):
            G.insertEdge(i, i + 1, 1997)
            G.insertEdge(i, i + 2, 1997)
        resultList = abunGraphLib.BFS_revisit(1, 3, G, 1)
        print "resultList", resultList
    if False:
        # Path-sequence formation on a fixed test case.
        folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
            "Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"
        abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)
    if False:
        # NOTE(review): relies on folderName/contigFile bound by the previous
        # (also disabled) branch -- would NameError if enabled alone.
        lenDic = IORobot.obtainLength(folderName, contigFile)
        N1 = len(lenDic)
        print "N1", N1
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "phaseStringGraph1")
        adj = [[] for i in range(N1)]
        for i in range(N1):
            adj[i] = abunGraphLib.findAllReachable(i, N1, G)
        Gnew = abunGraphLib.seqGraphDynamic(N1)
        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i, j, 1997)
        Gnew.initAdv()
        Gnew.doubleEdgeReduction()
        contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
        contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)
        print "contigPaths", contigPaths
        print "contigReadPaths", contigReadPaths
        Gnew.transitiveReduction()
    if False:
        toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
        print toDelete
    if False:
        # Inspect the resolved graph against lengths and abundance counts.
        G = graphLib.seqGraph(0)
        G.loadFromFile("Apr10TestA/", "xResolvedGraph")
        if False:
            for i in range(len(G.graphNodesList)):
                v = G.graphNodesList[i]
                if len(v.nodeIndexList) > 0:
                    print i, v.listOfPrevNodes, v.listOfNextNodes
            G.reportEdge()
        lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
        mylist = [401, 207, 405, 407, 344]
        json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
        myCountDic = json.load(json_data)
        for x in mylist:
            print x, lenDic["Contig" + str(x / 2) + "_p"], myCountDic["Segkk" + str(x / 2)]
    if False:
        # Compare node counts between the resolved graph and the contig file.
        folderName = "Apr10TestA/"
        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")
        json_data = open(folderName + "mapDummyToRealDic.json", 'r')
        mapDummyToRealDic = json.load(json_data)
        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        print len(G.graphNodesList)
        print len(mapDummyToRealDic)
        print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)
    if False:
        abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")
    if False:
        nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")
    if False:
        # Build a reachability graph with a path-multiplicity threshold, then
        # strip adaptor-skipped edges (i <-> its own reverse complement i+-1)
        # and any edges bridging across such a pair.
        folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
        G = graphLib.seqGraph(0)
        kthres, edgeThres = 3, 1
        G.loadFromFile(folderName, contigReadGraph)
        lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
        N1 = len(lenDic)
        adj = [[] for i in range(N1)]
        for i in range(N1):
            tmpList = abunGraphLib.findAllReachable(i, N1, G)
            for j in tmpList:
                if len(abunGraphLib.findAllPathK(i, j, G, kthres)) >= edgeThres:
                    adj[i].append(j)
            # print i, adj[i]
        # ## Filter adaptor skipped case
        adaptorPair = []
        for i in range(len(adj)):
            if i % 2 == 0:
                if i + 1 in adj[i]:
                    adj[i].remove(i + 1)
                    adaptorPair.append([i, i + 1])
            elif i % 2 == 1:
                if i - 1 in adj[i]:
                    adj[i].remove(i - 1)
                    adaptorPair.append([i, i - 1])
        Gnew = abunGraphLib.seqGraphDynamic(N1)
        for i in range(N1):
            for j in adj[i]:
                Gnew.insertEdge(i, j, 1997)
        for eachpair in adaptorPair:
            u, v = eachpair[0], eachpair[1]
            for x in Gnew.graphNodesList[u].listOfPrevNodes:
                xIndex = x[0]
                Gnew.removeEdge(xIndex, v)
            for y in Gnew.graphNodesList[v].listOfNextNodes:
                yIndex = y[0]
                Gnew.removeEdge(u, yIndex)
        # Gnew.reportEdge()
        # Count 2-in/2-out nodes (candidate repeat nodes).
        count2 = 0
        for i in range(len(Gnew.graphNodesList)):
            if len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
                count2 = count2 + 1
                print str(i) + "{color:red}"
        print "count2, ", count2
        # ## End filter adaptor skipped case
    if True:
        nonRedundantResolver.removeRedundantWithFile("May11TestB/", "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def getAllAssociatedReads(folderName, mummerLink, forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta
    Output : all_associated_reads.fasta
    Algorithm :
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i)  Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads

    NOTE(review): a second definition of getAllAssociatedReads appears later
    in this file and overrides this one at import time; this version
    additionally calls gapFiller.formRelatedReadsFile and sizes the split by
    houseKeeper.globalParallelFileNum.  Confirm which one is intended.
    '''
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta", "raw_reads.fasta"
    gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3")
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    # N = number of expansion rounds; each round pulls in reads that align
    # onto the current seed set.
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    print "N: ", N
    if N > 0:
        for trial in range(N):
            print "trial", trial
            numberOfFiles = houseKeeper.globalParallelFileNum
            if True:
                workerList = []
                for dummyI in range(1, numberOfFiles + 1):
                    # Split files are zero-padded below 10 by fasta-splitter.
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, "raw_reads.part-" + indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
            dataList = []
            for i in range(1, 1 + numberOfFiles):
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList + alignerRobot.extractMumData(folderName, header + str(indexOfMum) + "Out")
            # Keep only alignments passing checkSatisfy, then collect the
            # distinct read names (one per groupby key).
            filterList = []
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
            filterList.sort(key=itemgetter(-1))
            newReads = []
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
            f = open(folderName + forFastaName + ".txt", 'w')
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
            # Extract the named reads from raw_reads.fasta via a perl one-liner.
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        # No expansion requested: the related reads are the answer.
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
def generateAbundanceGraph(folderName, mummerLink):
    print "generateAbundanceGraph"
    '''
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V
        b. And then write a short parser to parse the results V

    NOTE(review): an identical (reformatted) definition of
    generateAbundanceGraph appears later in this file and overrides this one
    at import time; this copy is dead code as written.
    '''
    # Map raw reads back onto improved3.fasta in 20 parallel chunks.
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        # Split files are zero-padded below 10 by fasta-splitter.
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        '''
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum
        '''
        outputName, referenceName, queryName, specialName = "outAbun" + indexOfMum, "improved3.fasta", "raw_reads.part-" + indexOfMum + ".fasta", "outAbun" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
    '''
    command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
    os.system(command)
    command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
    os.system(command)
    '''
    # Concatenate the per-chunk alignment records.
    dataList = []
    for i in range(1, 1 + numberOfFiles):
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out")
    '''
    2. Calculate count on the abundances
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass
    '''
    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]
    # NOTE(review): thres and lenSum are assigned but never used below.
    thres = 30
    lenSum = 0
    extraDataList = []
    print "len(dataList)", len(dataList)
    # Optionally run a refined realignment pass for reads that missed,
    # then recompute coverage with the combined alignment set.
    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []
    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)
    with open(folderName + 'myCountDic.json', 'w') as f:
        json.dump(myCountDic, f)
    return myCountDic
def generateAbundanceGraph(folderName, mummerLink):
    print "generateAbundanceGraph"
    """
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V
        b. And then write a short parser to parse the results V

    NOTE(review): this is a reformatted duplicate of the definition above;
    being the later definition, this is the one that takes effect at import.
    """
    # Map raw reads back onto improved3.fasta in 20 parallel chunks.
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        # Split files are zero-padded below 10 by fasta-splitter.
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        """
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum
        """
        outputName, referenceName, queryName, specialName = (
            "outAbun" + indexOfMum,
            "improved3.fasta",
            "raw_reads.part-" + indexOfMum + ".fasta",
            "outAbun" + indexOfMum,
        )
        workerList.append([outputName, referenceName, queryName, specialName])
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
    """
    command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
    os.system(command)
    command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
    os.system(command)
    """
    # Concatenate the per-chunk alignment records.
    dataList = []
    for i in range(1, 1 + numberOfFiles):
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out")
    """
    2. Calculate count on the abundances
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass
    """
    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]
    # NOTE(review): thres and lenSum are assigned but never used below.
    thres = 30
    lenSum = 0
    extraDataList = []
    print "len(dataList)", len(dataList)
    # Optionally run a refined realignment pass for reads that missed,
    # then recompute coverage with the combined alignment set.
    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []
    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)
    with open(folderName + "myCountDic.json", "w") as f:
        json.dump(myCountDic, f)
    return myCountDic
def getAllAssociatedReads(folderName, mummerLink, forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta
    Output : all_associated_reads.fasta
    Algorithm :
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i)  Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads

    NOTE(review): duplicate of the earlier getAllAssociatedReads definition;
    being later in the file, this version takes effect at import.  Unlike the
    earlier copy it does NOT call gapFiller.formRelatedReadsFile and it
    hard-codes numberOfFiles = 20 instead of houseKeeper.globalParallelFileNum.
    '''
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta", "raw_reads.fasta"
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    # N = number of expansion rounds; each round pulls in reads that align
    # onto the current seed set.
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    print "N: ", N
    if N > 0:
        for trial in range(N):
            print "trial", trial
            numberOfFiles = 20
            if True:
                workerList = []
                for dummyI in range(1, numberOfFiles + 1):
                    # Split files are zero-padded below 10 by fasta-splitter.
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, "raw_reads.part-" + indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
            dataList = []
            for i in range(1, 1 + numberOfFiles):
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList + alignerRobot.extractMumData(folderName, header + str(indexOfMum) + "Out")
            # Keep only alignments passing checkSatisfy, then collect the
            # distinct read names (one per groupby key).
            filterList = []
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
            filterList.sort(key=itemgetter(-1))
            newReads = []
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
            f = open(folderName + forFastaName + ".txt", 'w')
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
            # Extract the named reads from raw_reads.fasta via a perl one-liner.
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        # No expansion requested: the related reads are the answer.
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
import matplotlib.pyplot as plt from finisherSCCoreLib import IORobot lenDic = {} coverageDic = {} lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta") f = open("/Users/kakitlam/Documents/abundata", 'r') tmp = f.readline() while len(tmp) > 0: if len(tmp) > 10: myitem = tmp[0:-1].split() coverageDic[myitem[0]] = float(myitem[1]) tmp = f.readline() f.close() myList = [] baseCt = {} for eachitem in lenDic: myList.append(lenDic[eachitem] * coverageDic[eachitem]) baseCt[eachitem] = lenDic[eachitem] * coverageDic[eachitem] for eachitem in lenDic: print eachitem, baseCt[eachitem] for eachitem in lenDic: print eachitem, lenDic[eachitem]
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename):
    '''
    Resolve the gap between two contigs of a matched pair.

    Input :
        eachmatchpair   : resolved pair; [0] is the left contig node id and
                          [-1] the right contig node id.
        folderName      : working directory.
        N1              : number of contig nodes; graph ids >= N1 are reads.
        mummerLink      : path to the MUMmer binaries.
        contigReadGraph : filename of the contig-read string graph.
        contigFilename / readsetFilename : fasta basenames (the "_Double"
                          variants are read from disk).

    Output :
        [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
        where leftEnd is the cut point on the left contig, middleContent the
        stitched read sequence bridging the gap (may be ""), and rightStart
        is always 0 as written.
    '''
    # print eachmatchpair
    leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, ""
    succReadsList = []
    G = seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    # BFS returns the node path from left contig to right contig; strip the
    # two contig endpoints so only the bridging reads remain.
    succReadsList = BFS(leftCtgIndex, rightCtgIndex, G, N1)
    if len(succReadsList) > 0:
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"
    print "succReadsList", succReadsList
    if len(succReadsList) == 0:
        # No bridging reads: the contigs overlap directly; align them to find
        # the cut point on the left contig.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap contig : ", overlap
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
    else:
        # Bridging reads exist: walk contig -> read_0 -> ... -> read_k ->
        # contig, trimming each segment by its overlap with the next.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        readName = abunHouseKeeper.parseIDToName(succReadsList[0], 'R', N1)
        print readName
        rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap start read : ", overlap
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
        # Middle reads: append each read minus its overlap with the next.
        for i in range(len(succReadsList) - 1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], 'R', N1)
            leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], 'R', N1)
            rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0:len(leftSeg) - overlap[0]]
        # Last read: trim by its overlap with the right contig.
        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], 'R', N1)
        leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex))
        print "overlap end read : ", overlap
        middleContent = middleContent + leftSeg[0:len(leftSeg) - overlap[0]]
    return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]