def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName): thres = 10 if True: alignerRobot.useMummerAlignBatch( mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel ) dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut") lenDicR = IORobot.obtainLength(folderName, fileR) lenDicQ = IORobot.obtainLength(folderName, fileQ) isRedundantList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] l1, l2 = lenDicR[name1], lenDicQ[name2] if abs(l2 - match2) < thres: isRedundantList.append(name2) # print lenDicQ nonRedundantList = obtainComplement(lenDicQ, isRedundantList) print nonRedundantList IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList) os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeEmbedded(folderName , mummerLink): print "removeEmbedded" thres = 10 os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta") os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") if not os.path.isfile(folderName + "selfOut"): alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel ) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName lenDic = IORobot.obtainLength(folderName, 'contigs.fasta') removeList = alignerRobot.extractMumDataAndRemove(folderName,"selfOut",lenDic,thres) nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):
    """Remove query contigs that are nearly fully matched against the reference set.

    Aligns fileQ against fileR; queries whose match length is within `thres`
    of their own length are marked redundant.  The complement is written to
    outputFileName, then SC_n_tmp.fasta is copied over SC_n.fasta.
    """
    thres = 10
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName,
                                         [["redundantRvsQ", fileR, fileQ, ""]],
                                         houseKeeper.globalParallel)
    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)
    isRedundantList = []
    for eachitem in dataList:
        # Columns: match lengths at [4]/[5], reference/query names at [7]/[8].
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]
        # Query covered to within `thres` bases of its full length => redundant.
        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)
    #print lenDicQ
    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)
    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)
    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    """Self-align a FASTA file and remove contained contigs, keeping one
    representative per cluster of mutually-contained ("short embedded") contigs.

    Uses a union-find (clusterElem/union/find) so that when two contigs contain
    each other, exactly one — the cluster representative — is kept.
    """
    thres = 10
    # Strip '|' from headers in place; MUMmer tooling mishandles them.
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta > " + folderName + inputFilename + "2.fasta")
    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName,
                                         [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
                                         houseKeeper.globalParallel)
    # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
    # outputName, referenceName, queryName, specialName
    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, inputFilename + '.fasta')
    removeList = []
    # One union-find node per contig; clusters collect mutually-embedded contigs.
    shortEmbedClusterDic = {}
    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]
            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                # name1 fully covered, name2 not => name1 is embedded in name2.
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                # Mutual containment: merge the two clusters instead of removing.
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])
    nameList = obtainComplement(lenDic, removeList)
    returnList = []
    # Keep only the representative of each short-embed cluster.
    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)
    print "len(nameList), len(returnList)", len(nameList), len(returnList)
    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    """Self-align `inputFilename`.fasta and write non-contained contigs to
    `outputFileName`, keeping one representative per mutually-contained cluster.
    """
    thres = 10
    # '|' in FASTA headers confuses MUMmer post-processing; rewrite without it.
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta > " + folderName + inputFilename + "2.fasta")
    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")
    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink,
            folderName,
            [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
            houseKeeper.globalParallel,
        )
    # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
    # outputName, referenceName, queryName, specialName
    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta")
    removeList = []
    # Union-find node per contig name for grouping mutually-embedded contigs.
    shortEmbedClusterDic = {}
    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]
            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                # Only name1 is (near-)fully covered: it is embedded, remove it.
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                # Both near-fully covered: union their clusters, resolve later.
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])
    nameList = obtainComplement(lenDic, removeList)
    returnList = []
    # From each cluster of mutually-embedded survivors keep only its root.
    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)
    print "len(nameList), len(returnList)", len(nameList), len(returnList)
    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeEmbedded(folderName, mummerLink): print "removeEmbedded" thres = 10 command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName os.system(command) command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName os.system(command) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[ "self", houseKeeper.globalContigName, houseKeeper.globalContigName, "" ]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName) removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[ 7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink): print "removeEmbedded" thres = 10 os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta") os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") if True: print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta" alignerRobot.useMummerAlignBatch( mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName print "removeEmbedded: Extracting MUMmer data from delta files to selfOut" dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, 'contigs.fasta') removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[ 7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta" IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeEmbedded(folderName , mummerLink): print "removeEmbedded" thres = 10 command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName os.system(command) command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName os.system(command) if True: alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]], houseKeeper.globalParallel) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName) removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
def mainFlow(folderName, mummerLink, pickupname, mapcontigsname): print "Go Bears! ! !" print "pickupname, mapcontigsname", pickupname, mapcontigsname if not pickupname in [ "noEmbed.fasta", "improved.fasta", "improved2.fasta" ]: nonRedundantResolver.removeEmbedded(folderName, mummerLink) if not pickupname in ["improved.fasta", "improved2.fasta"]: overlapResolver.fetchSuccessor(folderName, mummerLink) overlapResolver.formSeqGraph(folderName, mummerLink) if not pickupname in ["improved2.fasta"]: gapFiller.fillGap(folderName, mummerLink) twoRepeatOneBridgeSolver.xPhased(folderName, mummerLink) # ECReduction(folderName , mummerLink ) # compareWithReference(folderName , mummerLink) IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "noEmbedtmp.fasta", "noEmbed.fasta") IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improvedtmp.fasta", "improved.fasta") IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improved2tmp.fasta", "improved2.fasta") IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improved3tmp.fasta", "improved3.fasta") if mapcontigsname != None: houseKeeper.performMapping(folderName, mummerLink, mapcontigsname) print "<3 Do cool things that matter <3"
def mainFlow(folderName , mummerLink, pickupname, mapcontigsname): print "Go Bears! ! !" print "pickupname, mapcontigsname", pickupname, mapcontigsname if not pickupname in ["noEmbed.fasta", "improved.fasta", "improved2.fasta"]: nonRedundantResolver.removeEmbedded(folderName , mummerLink) if not pickupname in ["improved.fasta", "improved2.fasta"]: overlapResolver.fetchSuccessor(folderName , mummerLink) overlapResolver.formSeqGraph(folderName , mummerLink) if not pickupname in ["improved2.fasta"]: gapFiller.fillGap(folderName , mummerLink) twoRepeatOneBridgeSolver.xPhased(folderName , mummerLink) # ECReduction(folderName , mummerLink ) # compareWithReference(folderName , mummerLink) IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "noEmbedtmp.fasta", "noEmbed.fasta") IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improvedtmp.fasta", "improved.fasta") IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improved2tmp.fasta", "improved2.fasta") IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improved3tmp.fasta", "improved3.fasta") if mapcontigsname != None: houseKeeper.performMapping(folderName, mummerLink, mapcontigsname) print "<3 Do cool things that matter <3"
def removeEmbedded(folderName, mummerLink): print "removeEmbedded" thres = 10 os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta") os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") if True: print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta" alignerRobot.useMummerAlignBatch( mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel ) # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta") # outputName, referenceName, queryName, specialName print "removeEmbedded: Extracting MUMmer data from delta files to selfOut" dataList = alignerRobot.extractMumData(folderName, "selfOut") dataList = alignerRobot.transformCoor(dataList) lenDic = IORobot.obtainLength(folderName, "contigs.fasta") removeList = [] for eachitem in dataList: match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8] if name1 != name2: l1, l2 = lenDic[name1], lenDic[name2] if abs(l1 - match1) < thres and abs(l2 - match2) > thres: removeList.append(name1) elif abs(l1 - match1) > thres and abs(l2 - match2) < thres: removeList.append(name2) elif abs(l1 - match1) < thres and abs(l2 - match2) < thres: print "Both shortembedd", eachitem nameList = [] for eachitem in lenDic: nameList.append(eachitem) print len(nameList) for eachitem in removeList: if eachitem in nameList: nameList.remove(eachitem) print len(nameList) print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta" IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def formSeqGraph(folderName , mummerLink): print "formSeqGraph" startList, graphNodes = [], [] print "formSeqGraph: Reading in best successors and predecessors" rightConnect = readConnectList(folderName, "rightConnect.txt") leftConnect = readConnectList(folderName, "leftConnect.txt") numberOfNodes = len(rightConnect) print "formSeqGraph: Initializing seqGraph" G = graphLib.seqGraph(numberOfNodes) print "formSeqGraph: Adding edges to seqGraph" for eachitem, i in zip(rightConnect, range(len(rightConnect))): index = i connector, weight = eachitem G.insertEdge(index, connector, weight) for eachitem, i in zip(leftConnect, range(len(leftConnect))): index = i connector, weight = eachitem G.insertEdge(connector, index, weight) G.cleanEdge() G.condense() print "formSeqGraph: Outputting seqGraph to condensedGraph.txt" G.saveToFile(folderName, "condensedGraph.txt") G.checkSelfLoops() G.checkCompleteness() G2 = graphLib.seqGraph(0) G2.loadFromFile(folderName, "condensedGraph.txt") houseKeeper.compareGraphUnitTest(G, G2) G.reportDummyUsefulNode() G.reportEdge() graphFileName = "condensedGraph.txt" contigFile = "noEmbed_Double.fasta" outContigFile = "improved.fasta" outOpenList = "openZone.txt" print "formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta" IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
def formSeqGraph(folderName, mummerLink): print "formSeqGraph" startList, graphNodes = [], [] print "formSeqGraph: Reading in best successors and predecessors" rightConnect = readConnectList(folderName, "rightConnect.txt") leftConnect = readConnectList(folderName, "leftConnect.txt") numberOfNodes = len(rightConnect) print "formSeqGraph: Initializing seqGraph" G = graphLib.seqGraph(numberOfNodes) print "formSeqGraph: Adding edges to seqGraph" for eachitem, i in zip(rightConnect, range(len(rightConnect))): index = i connector, weight = eachitem G.insertEdge(index, connector, weight) for eachitem, i in zip(leftConnect, range(len(leftConnect))): index = i connector, weight = eachitem G.insertEdge(connector, index, weight) G.cleanEdge() G.condense() print "formSeqGraph: Outputting seqGraph to condensedGraph.txt" G.saveToFile(folderName, "condensedGraph.txt") G.checkSelfLoops() G.checkCompleteness() G2 = graphLib.seqGraph(0) G2.loadFromFile(folderName, "condensedGraph.txt") houseKeeper.compareGraphUnitTest(G, G2) G.reportDummyUsefulNode() G.reportEdge() graphFileName = "condensedGraph.txt" contigFile = "noEmbed_Double.fasta" outContigFile = "improved.fasta" outOpenList = "openZone.txt" print "formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta" IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
def observeOverlap(folderName):
    """Scan self-alignments for long overlaps between different contigs and
    break contigs at the implied chimeric junctions.

    Alignments with a long match (>= matchThres) whose non-matching flank on
    at least one side exceeds nonMatchThres are treated as suspicious; break
    points derived from them (getBreakPointFromDataList) are used to split
    the contigs, and the pieces are written to breakChains.fasta.
    """
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    matchThres = 10000     # minimum alignment length to be considered
    nonMatchThres = 500    # minimum unaligned flank to look suspicious
    count = 0
    newDataList = []
    for eachitem in dataList:
        name1, name2 = eachitem[-2], eachitem[-1]
        matchLen1, matchLen2 = eachitem[4], eachitem[5]
        start1, end1, start2, end2 = eachitem[0], eachitem[1], eachitem[2], eachitem[3]
        # if name1!= name2 and min(lenDic[name1] - end1, lenDic[name2] - end2 ) > nonMatchThres \
        # and min(start1, start2) > nonMatchThres \
        # Keep cross-contig alignments that are long and leave a substantial
        # unaligned flank at either the head or the tail of both contigs.
        if name1 != name2 and (min(lenDic[name1] - end1, lenDic[name2] - end2) > nonMatchThres
                               or min(start1, start2) > nonMatchThres) \
                and matchLen1 > matchThres:
            print "eachitem ", eachitem, lenDic[name1], lenDic[name2]
            count = count + 1
            newDataList.append(eachitem)
    print "Count: " + str(count)
    blkDic = getBreakPointFromDataList(folderName, newDataList)
    LCList = IORobot.loadContigsFromFile(folderName, "contigs.fasta")
    contigList = []
    for eachcontig in LCList:
        # print eachcontig
        if not eachcontig in blkDic:
            # No break points for this contig: keep it whole.
            contigList = contigList + [LCList[eachcontig]]
        else:
            # Split the contig at its break points.
            contigList = contigList + tmpBreakAcBkPts(LCList[eachcontig], blkDic[eachcontig])
    print "len(contigList)", len(contigList)
    IORobot.writeSegOut(contigList, folderName, "breakChains.fasta")
def runningTestSet(self, myFolderName, ctexpected):
    """Integration test: run finisherSC on a data folder end to end.

    Copies self.listOfFiles from myFolderName into a scratch folder, runs the
    finisherSC.py pipeline there, and asserts the final improved3.fasta
    contains exactly `ctexpected` contigs before cleaning up.
    """
    print "Integration test on FinisherSC: " + myFolderName
    self.sourceFolder = myFolderName
    os.system("mkdir " + self.testingFolder)
    for eachitem in self.listOfFiles:
        os.system("cp " + self.sourceFolder + eachitem + " " + self.testingFolder)
    # Run the whole pipeline with 4-way parallelism.
    os.system("python finisherSC.py -par 4 " + self.testingFolder + " " + self.mummerPath)
    lenDic = IORobot.obtainLength(self.testingFolder, "/improved3.fasta")
    print lenDic
    assert (len(lenDic) == ctexpected)
    # Remove the scratch folder regardless of content.
    os.system("rm -rf " + self.testingFolder)
def getBreakPointFromDataList(folderName, dataList): g = 1000 blkDic = {} dataList.sort(key=itemgetter(-2)) lenDic = IORobot.obtainLength(folderName, "contigs.fasta") json_data = open(folderName + "modifiedOutliners.json", 'r') breakPtsDic = json.load(json_data) sep = 5000 for key, items in groupby(dataList, itemgetter(-2)): contigName = key newList = [] for eachitem in items: newList.append([eachitem[0], eachitem[1]]) newList.sort() bktmp = [0] if newList[0][0] > g: if withinBound(sep, breakPtsDic[contigName], newList[0][0]): bktmp.append(newList[0][0]) #bktmp.append(newList[0][0]) for i in range(len(newList) - 1): if newList[i + 1][0] > newList[i][1] + g: if withinBound(sep, breakPtsDic[contigName], newList[i + 1][0]): bktmp.append(newList[i + 1][0]) bktmp.append(lenDic[contigName]) blkDic[contigName] = bktmp print "contigName: " + contigName print "bktmp:", bktmp print "breakPtsDic[contigName]", breakPtsDic[contigName] return blkDic
def formRelatedReadsFile(folderName, mummerLink):
    """Extract raw reads that overlap contig ends into relatedReads.fasta.

    Steps: (1) write the first/last `thres` bases of every improved contig
    (both strands) to improvedTrunc.fasta; (2) split the raw reads and align
    each part against the truncated contig ends with MUMmer; (3) collect read
    names whose alignment spans a contig-end segment; (4) filter the raw read
    file down to those names and write a double-stranded copy.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; block boundaries follow the apparent statement order.
    """
    # Find associated read and extract into a file associatedReads.fasta
    # Input: contigs.fasta, cleaned_Reads.fasta
    # Output: relatedReads.fasta
    # ## Extract heads of the contigs
    print ">formRelatedReadsFile"
    f = open(folderName + "improved.fasta", 'r')
    f2 = open(folderName + "improvedTrunc.fasta", 'w')
    temp = f.readline()
    tempContig = ""
    thres = 400      # length of the contig end segments written out
    runningIndex = 0
    endThres = 10    # alignment must start within this many bases of a segment end
    while len(temp) > 0:
        if temp[-1] == '\n':
            temp = temp[0:-1]
        if temp[0] == '>':
            # Header: flush the previous contig's four end segments
            # (head, tail, and their reverse complements).
            if len(tempContig) > 0:
                IORobot.writeToFile(f2, runningIndex, tempContig[0:thres])
                runningIndex = runningIndex + 1
                IORobot.writeToFile(f2, runningIndex, tempContig[-thres:])
                runningIndex = runningIndex + 1
                IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[0:thres]))
                runningIndex = runningIndex + 1
                IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[-thres:]))
                runningIndex = runningIndex + 1
                tempContig = ""
        else:
            tempContig = tempContig + temp
        temp = f.readline()
    # Flush the final contig (no trailing header triggers it in the loop).
    IORobot.writeToFile(f2, runningIndex, tempContig[0:thres])
    runningIndex = runningIndex + 1
    IORobot.writeToFile(f2, runningIndex, tempContig[-thres:])
    runningIndex = runningIndex + 1
    IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[0:thres]))
    runningIndex = runningIndex + 1
    IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[-thres:]))
    runningIndex = runningIndex + 1
    f2.close()
    f.close()
    # ## Write double stranded reads
    IORobot.writeToFile_Double1(folderName, "improved.fasta", "improved_Double.fasta", "contig")
    # writeToFile_Double1(folderName, "raw_reads.fasta", "raw_reads_Double.fasta","read")
    # ## Apply MUMMER on them using cleanedReads against them
    assoiatedReadIndex = []
    nameList = []
    numberOfFiles = max(20, houseKeeper.globalParallel)
    if True:
        # Split the read file into numberOfFiles parts for parallel alignment.
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + houseKeeper.globalReadName
        os.system(command)
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        # Part files are zero-padded to two digits by fasta-splitter.
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        outputName, referenceName, queryName, specialName = "outGapFillRaw" + indexOfMum, "improvedTrunc.fasta", houseKeeper.globalReadName[0:-6] + ".part-" + indexOfMum + ".fasta", "fromMum" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, True)
    # alignerRobot.useMummerAlign(mummerLink, folderName, "out", "improvedTrunc.fasta", "raw_reads.part-" + indexOfMum + ".fasta", True, "fromMum" + indexOfMum )
    '''
    command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improvedTrunc.fasta raw_reads.part-" + indexOfMum + ".fasta"
    os.system(command)
    command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMum" + indexOfMum
    os.system(command)
    '''
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        f = open(folderName + "fromMum" + indexOfMum, 'r')
        # Skip the 6 header lines of show-coords output.
        for i in range(6):
            tmp = f.readline()
        while len(tmp) > 0:
            # show-coords columns are separated by '|'; last field holds names.
            infoArr = tmp.split('|')
            myArr = infoArr[-1].split('\t')
            rdGpArr = infoArr[-1].split('\t')
            contigName = rdGpArr[0].rstrip().lstrip()
            readName = rdGpArr[1].rstrip().lstrip()
            endSegArr = infoArr[0].split(" ")
            pos = []
            for eachitem in endSegArr:
                if len(eachitem) > 0:
                    pos.append(int(eachitem))
            startPos = pos[0]
            endPos = pos[1]
            # Read must cover (almost) a whole truncated end segment.
            if startPos < endThres and endPos > thres - endThres:
                assoiatedReadIndex.append(myArr[1])
                nameList.append([int(contigName.split('_')[1]), readName])
            tmp = f.readline()
        f.close()
    nameList.sort()
    assoiatedReadIndex.sort()
    # print "assoiatedReadIndex", assoiatedReadIndex
    ckIndex = 0
    f = open(folderName + "associatedNames.txt", 'w')
    oneItem = 0
    keyFound = []
    # Keep only read names seen more than once (groupby on the sorted list).
    for key, items in groupby(assoiatedReadIndex):
        countItem = 0
        for eachitem in items:
            countItem += 1
        if countItem == 1:
            oneItem += 1
        else:
            key = key.rstrip()
            if not key in keyFound:
                f.write(key + '\n')
                keyFound.append(key)
            ckIndex += 1
    print "ckIndex,oneItem: ", ckIndex, oneItem
    f.close()
    # Cap the associated name list at maxCount entries.
    fFilter = open(folderName + "associatedNames.txt", 'r')
    fout = open(folderName + "associatedNames2.txt", 'w')
    maxCount = 12000
    mytmpDum = fFilter.readline()
    i = 0
    while i < maxCount and len(mytmpDum) > 0:
        fout.write(mytmpDum)
        mytmpDum = fFilter.readline()
        i = i + 1
    fout.close()
    fFilter.close()
    # Extract the named reads from the read file into relatedReads.fasta.
    command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "associatedNames2.txt " + folderName + houseKeeper.globalReadName + " > " + folderName + "relatedReads.fasta"
    os.system(command)
    IORobot.writeToFile_Double1(folderName, "relatedReads.fasta", "relatedReads_Double.fasta", "read")
def writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink):
    """Stitch chains of contigs (contigList) into merged sequences, bridging
    gaps with raw reads, and write the result to improved2.fasta.

    NOTE(review): this body was reconstructed from a whitespace-collapsed
    source; the nesting of the hack/override sections was inferred from
    statement order — confirm against the project history.
    """
    # ## repeat aware logging
    # print "myExtraLinkList", myExtraLinkList
    # ## end repeat aware logging
    myExtraLinkList = loggingReadsToRepeat(blockedSet + dataSet, contigList)
    i = 0
    # Load all improved contig sequences into readSet (order = file order).
    fOriginal = open(folderName + "improved.fasta", 'r')
    readSet = []
    tmp = fOriginal.readline().rstrip()
    tmpRead = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            if len(tmpRead) > 0:
                readSet.append(tmpRead)
                tmpRead = ""
        else:
            tmpRead = tmpRead + tmp
        tmp = fOriginal.readline().rstrip()
    readSet.append(tmpRead)
    fOriginal.close()
    # ## Put the needed rawReads into the RAM using Dictionary
    fAppendRaw = open(folderName + "appendRaw.txt", 'w')
    for eachraw in rawReadList:
        fAppendRaw.write(eachraw)
        fAppendRaw.write('\n')
    fAppendRaw.close()
    # Filter relatedReads_Double.fasta down to the reads named in appendRaw.txt.
    command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "appendRaw.txt " + folderName + "relatedReads_Double.fasta > " + folderName + "rawToAppend.fasta"
    os.system(command)
    rawRead = {}
    fOriginal = open(folderName + "rawToAppend.fasta", 'r')
    tmp = fOriginal.readline().rstrip()
    tmpRead = ""
    tmpName = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            if len(tmpRead) > 0:
                rawRead[tmpName] = tmpRead
                tmpRead = ""
            tmpName = tmp[1:]
        else:
            tmpRead = tmpRead + tmp
        tmp = fOriginal.readline().rstrip()
    rawRead[tmpName] = tmpRead
    # ## End
    seqToPrint = []
    # One "used" flag per undirected contig (two directed copies each).
    contigUsed = [False for i in range(numberOfContig / 2)]
    # storedStrand[k] = [output-contig index, flag]; 'n' = not yet assigned,
    # 'p' = placed directly, 'd' = derived from its reverse-complement twin.
    storedStrand = [[-1, 'n'] for i in range(numberOfContig)]
    finalList = []
    for eachContig, i in zip(contigList, range(len(contigList))):
        tmpList = []
        for eachitem in eachContig:
            readNum = eachitem / 2
            if contigUsed[readNum] == False:
                seqToPrint.append(eachitem)
                tmpList.append(eachitem)
                contigUsed[readNum] = True
                # ## mark ouput strandinfo
                storedStrand[eachitem] = [len(finalList), 'p']
        if len(tmpList) > 0:
            finalList.append(tmpList)
    # Propagate placement to the unplaced strand twin (k even <-> k+1, odd <-> k-1).
    for kkk in range(len(storedStrand)):
        if storedStrand[kkk][1] == 'n':
            if kkk % 2 == 0:
                storedStrand[kkk][0] = storedStrand[kkk + 1][0]
                storedStrand[kkk][1] = 'd'
            else:
                storedStrand[kkk][0] = storedStrand[kkk - 1][0]
                storedStrand[kkk][1] = 'd'
    # ## begin stored output blocked pairs
    blockExtraStored(storedStrand, myExtraLinkList, folderName)
    # ## end output blocked pairs stored
    fImproved = open(folderName + "improved2.fasta", 'w')
    for eachcontig, dummyIndex in zip(finalList, range(len(finalList))):
        fImproved.write(">Segkk" + str(dummyIndex) + '\n')
        # Sentinels; real values come from rightConnect of the previous segment.
        tmpStore = -1997
        tmpStore2 = -1998
        tmpStore3 = -1999
        for eachseg, hidum in zip(eachcontig, range(len(eachcontig))):
            readNum = eachseg / 2
            orientation = eachseg % 2
            newStart = 0
            # Begin hack
            ### old statement
            x, y, l = tmpStore, leftConnect[eachseg][1], tmpStore2
            ### End old statement
            if hidum == 0:
                x, y, l = tmpStore, leftConnect[eachseg][1], tmpStore2
            else:
                # Recompute overlaps by aligning the previous segment against
                # the bridging raw read (tmpStore3), and the read against the
                # current segment, instead of trusting the stored values.
                prevseg = eachcontig[hidum - 1]
                prevReadNum = prevseg / 2
                prevOrient = prevseg % 2
                if prevOrient == 0:
                    leftSeg = readSet[prevReadNum]
                else:
                    leftSeg = houseKeeper.reverseComplement(readSet[prevReadNum])
                rightSeg = tmpStore3
                overlapX = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                leftSeg = tmpStore3
                if orientation == 0:
                    rightSeg = readSet[readNum]
                else:
                    rightSeg = houseKeeper.reverseComplement(readSet[readNum])
                overlapY = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                print "Before : x, y , l : ", x, y, l
                x = overlapX[1]
                y = overlapY[0]
                l = tmpStore2
                print "After : x, y , l : ", x, y, l
            # End hack
            extraRead = ""
            if hidum == 0:
                newStart = 0
            else:
                if l < x + y:
                    # Segments overlap through the read: trim the new segment.
                    # begin hack
                    ### old statement
                    newStart = x + y - l
                    ### end old statement
                    prevseg = eachcontig[hidum - 1]
                    prevReadNum = prevseg / 2
                    prevOrient = prevseg % 2
                    if prevOrient == 0:
                        leftSeg = readSet[prevReadNum]
                    else:
                        leftSeg = houseKeeper.reverseComplement(readSet[prevReadNum])
                    if orientation == 0:
                        rightSeg = readSet[readNum]
                    else:
                        rightSeg = houseKeeper.reverseComplement(readSet[readNum])
                    print "Before : ", newStart
                    overlapNewStart = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                    newStart = overlapNewStart[1]
                    print "After : ", newStart
                    # end hack
                else:
                    # Gap between segments: fill it with the middle of the read.
                    newStart = 0
                    if option == 'polish':
                        print "Missing polish"
                        extraRead = tmpStore3[x:l - y]
                        # extraRead = performPolishing(leftConnect[eachseg][0], eachseg, tmpStore3[x:l-y], dataSet, folderName)
                    else:
                        extraRead = tmpStore3[x:l - y]
                    print extraRead[0:10], len(extraRead)
                    fImproved.write(extraRead)
            # Write the (possibly trimmed) segment in its orientation.
            if orientation == 0:
                fImproved.write(readSet[readNum][newStart:])
            else:
                fImproved.write(houseKeeper.reverseComplement(readSet[readNum])[newStart:])
            # Stash this segment's right-connection read for the next iteration.
            if rightConnect[eachseg][1] != -1:
                tmpStore = rightConnect[eachseg][1]
                tmpStore2 = len(rawRead[rightConnect[eachseg][2]])
                tmpStore3 = rawRead[rightConnect[eachseg][2]]
        fImproved.write('\n')
    fImproved.close()
def extractEdgeSet(folderName, mummerLink, option="nopolish"):
    """Reconstruct the string graph linking improved contigs via reads.

    Tasks: reconstruct the string graph.
    Input: relatedReads_Double.fasta, contig_Double.fasta
    Intermediate files: the fromMumRefine* alignment outputs
    Output: connectivity of each node: InList, OutList [critical]
            connectivity of each node: arrow representation with size [optional]

    folderName -- working directory (used as a path prefix; presumably ends
                  with a separator, as in the os.system calls -- verify)
    mummerLink -- path to the MUMmer binaries, passed through to alignerRobot
    option     -- "nopolish" (default) or "polish"; forwarded unchanged to
                  writeContigReadCombine
    """
    # ## Perform MUMMER alignment
    print ">Extract Edge set"

    # Contig lengths keyed by name; the doubled set (forward + reverse) has
    # twice as many entries, hence numberOfContig = 2 * len(...) below.
    contigOnlyLengthDic = IORobot.obtainLength(folderName, "improved.fasta")

    # print lengthDic
    lengthDic = IORobot.findContigLength(folderName, "improved")

    numberOfContig = len(contigOnlyLengthDic) * 2

    # K: window parameter forwarded to obtainLinkInfoReadContig;
    # thres: alignment filtering threshold -- semantics live in IORobot.
    K = 400
    thres = 5

    # ## Apply MUMMER on them using cleanedReads against them
    # Keep only the contig ends (25 kb) so the read-vs-contig alignment stays
    # tractable; interior overlaps are irrelevant for edge extraction.
    IORobot.truncateEndOfContigs(folderName, "improved_Double.fasta", "smaller_improvedContig.fasta", 25000, lengthDic)

    dataSet = []
    numberOfFiles = max(20, houseKeeper.globalParallel)

    if True:
        # Split the read file into numberOfFiles parts so the MUMmer jobs can
        # run in parallel. fasta-splitter.pl is expected next to this script.
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "relatedReads_Double.fasta"
        os.system(command)

    workerList = []

    for dummyI in range(1, numberOfFiles + 1):
        # fasta-splitter zero-pads part numbers below 10 ("part-01", ...).
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        outputName, referenceName, queryName, specialName = "outGapFillRefine" + indexOfMum, "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", "fromMumRefine" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, True)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "outRefine", "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", True, "fromMumRefine" + indexOfMum)

    # Gather read-to-contig link records from every alignment part.
    for dummyI in range(1, numberOfFiles + 1):
        tmpSet = IORobot.obtainLinkInfoReadContig(dummyI, mummerLink, folderName, thres, lengthDic, K)
        dataSet = dataSet + tmpSet

    # ## repeat aware
    # Drop links ending at junctions that are not marked usable, collecting
    # the rejected ones in blockedSet for later use.
    usableJunction = loadOpenList(folderName)
    dataSet, blockedSet = filterRepeatEnd(dataSet, usableJunction)
    # ## repeat aware end

    dataSet.sort()
    matchPair = formMatchPairFromReadInfo(dataSet)

    # Bug fix on repeat detection from reads alone
    matchPair = filterRepeatPair(matchPair)
    # end bug fix

    # print matchPair
    bestMatchPair = []

    # matchPair must already be sorted on fields (0, 1) for groupby to see
    # each contig pair exactly once; keep only the heaviest link per pair.
    for key, items in groupby(matchPair, itemgetter(0, 1)):
        maxvalue = -1
        maxLenPair = []
        for eachitem in items:
            if eachitem[2] > maxvalue:
                maxvalue = eachitem[2]
                maxLenPair = [eachitem[3], eachitem[4], eachitem[5]]

        bestMatchPair.append([key[0], key[1], maxvalue, maxLenPair[0], maxLenPair[1], maxLenPair[2]])

    contigList, leftConnect, rightConnect, rawReadList = formbestpair(bestMatchPair, numberOfContig)

    print "contigList", contigList

    writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink)
def xPhased(folderName , mummerLink): # ## Repeat resolution [Proxy for MB] # 1. Re-form the contig string graph with ALL connections from contigs only V # 2. Log down the reads and associated blocked contigs V # 3. Use reads to connect; # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important); # 5. Read out contigs numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb") lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta") confidenLenThres = 0 G = graphLib.seqGraph(numberOfContig) extraEdges = loadEdgeFromBlockedReads(folderName) for eachitem in dataSet: # print eachitem wt, myin, myout = eachitem myInData = myin[6:].split('_') myOutData = myout[6:].split('_') if myInData[1] == 'p': offsetin = 0 else: offsetin = 1 if myOutData[1] == 'p': offsetout = 0 else: offsetout = 1 i = int(myInData[0]) * 2 + offsetin j = int(myOutData[0]) * 2 + offsetout ck = False for eachedge in extraEdges: mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3] if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres: ck = True if ck: G.insertEdge(i, j, wt) # G.reportEdge() G.MBResolve() G.reportEdge() G.saveToFile(folderName, "condensedGraphMB.txt") graphFileName = "condensedGraphMB.txt" contigFile = "improved2_Double.fasta" outContigFile = "improved3.fasta" outOpenList = "openZoneMB.txt" IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList) # ## Repeat resolution [Proxy for phasing step] # 6. Find out the repeat region by MSA # 7. Find out the location of SNPs and extend across repeat # [short cut : use contig creator : your job here is to get data into the correct formats] print "xPhased"
def fetchSuccessor(folderName , mummerLink): print "fetchSuccessor" left_connect, right_connect = [], [] print "Direct greedy" print "fetchSuccessor: Aligning non-contained contigs to themselves, output files are greedy*.delta" numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy") # [next_item, overlap_length] leftConnect = [[-1, -1] for i in range(numberOfContig)] rightConnect = [[-1, -1] for i in range(numberOfContig)] dataSet.sort(reverse=True, key=itemgetter(1)) print "fetchSuccessor: Finding best successors" for key, items in groupby(dataSet, itemgetter(1)): # if key == "Contig217_d": # print "dddd" maxVal = -1 myName = key connectorName = "" for eachsubitem in items: if eachsubitem[0] > maxVal: maxVal = eachsubitem[0] connectorName = eachsubitem[2] prefix = myName.split('_') suffix = connectorName.split('_') lengthOfOverlap = maxVal if prefix[1] == 'p': prefixContig = int(prefix[0][6:]) * 2 else: prefixContig = int(prefix[0][6:]) * 2 + 1 if suffix[1] == 'p': suffixContig = int(suffix[0][6:]) * 2 else: suffixContig = int(suffix[0][6:]) * 2 + 1 assert(rightConnect[prefixContig][0] == -1) rightConnect[prefixContig][0] = suffixContig rightConnect[prefixContig][1] = lengthOfOverlap dataSet.sort(reverse=True, key=itemgetter(2)) print "fetchSuccessor: Finding best predecessors" for key, items in groupby(dataSet, itemgetter(2)): maxVal = -1 myName = key connectorName = "" for eachsubitem in items: if eachsubitem[0] > maxVal: maxVal = eachsubitem[0] connectorName = eachsubitem[1] prefix = connectorName.split('_') suffix = myName.split('_') lengthOfOverlap = maxVal if prefix[1] == 'p': prefixContig = int(prefix[0][6:]) * 2 else: prefixContig = int(prefix[0][6:]) * 2 + 1 if suffix[1] == 'p': suffixContig = int(suffix[0][6:]) * 2 else: suffixContig = int(suffix[0][6:]) * 2 + 1 assert(leftConnect[suffixContig][0] == -1) leftConnect[suffixContig][0] = prefixContig leftConnect[suffixContig][1] = lengthOfOverlap print "fetchSuccessor: 
Outputting best successors to rightConnect.txt" # ## Write to file: f = open(folderName + 'rightConnect.txt', 'w') for eachitem, dummyIndex in zip(rightConnect, range(len(rightConnect))): f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n') f.close() print "fetchSuccessor: Outputting best predecessors to leftConnect.txt" f = open(folderName + 'leftConnect.txt', 'w') for eachitem, dummyIndex in zip(leftConnect, range(len(leftConnect))): f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n') f.close()
def xPhased(folderName , mummerLink): # ## Repeat resolution [Proxy for MB] # 1. Re-form the contig string graph with ALL connections from contigs only V # 2. Log down the reads and associated blocked contigs V # 3. Use reads to connect; # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important); # 5. Read out contigs print "xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta" numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb") lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta") confidenLenThres = 0 print "xPhased: Building seqGraph" G = graphLib.seqGraph(numberOfContig) extraEdges = loadEdgeFromBlockedReads(folderName) for eachitem in dataSet: # print eachitem wt, myin, myout = eachitem myInData = myin[6:].split('_') myOutData = myout[6:].split('_') if myInData[1] == 'p': offsetin = 0 else: offsetin = 1 if myOutData[1] == 'p': offsetout = 0 else: offsetout = 1 i = int(myInData[0]) * 2 + offsetin j = int(myOutData[0]) * 2 + offsetout ck = False for eachedge in extraEdges: mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3] if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres: ck = True if ck: G.insertEdge(i, j, wt) # G.reportEdge() G.MBResolve() G.reportEdge() print "xPhased: Saving condensed seqGraph to condensedGraphMB.txt" G.saveToFile(folderName, "condensedGraphMB.txt") graphFileName = "condensedGraphMB.txt" contigFile = "improved2_Double.fasta" outContigFile = "improved3.fasta" outOpenList = "openZoneMB.txt" print "xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta" IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList) # ## Repeat resolution [Proxy for phasing step] # 6. Find out the repeat region by MSA # 7. 
Find out the location of SNPs and extend across repeat # [short cut : use contig creator : your job here is to get data into the correct formats] print "xPhased"
def fetchSuccessor(folderName , mummerLink): print "fetchSuccessor" left_connect, right_connect = [], [] print "Direct greedy" numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy") # [next_item, overlap_length] leftConnect = [[-1, -1] for i in range(numberOfContig)] rightConnect = [[-1, -1] for i in range(numberOfContig)] dataSet.sort(reverse=True, key=itemgetter(1)) for key, items in groupby(dataSet, itemgetter(1)): # if key == "Contig217_d": # print "dddd" maxVal = -1 myName = key connectorName = "" for eachsubitem in items: if eachsubitem[0] > maxVal: maxVal = eachsubitem[0] connectorName = eachsubitem[2] prefix = myName.split('_') suffix = connectorName.split('_') lengthOfOverlap = maxVal if prefix[1] == 'p': prefixContig = int(prefix[0][6:]) * 2 else: prefixContig = int(prefix[0][6:]) * 2 + 1 if suffix[1] == 'p': suffixContig = int(suffix[0][6:]) * 2 else: suffixContig = int(suffix[0][6:]) * 2 + 1 assert(rightConnect[prefixContig][0] == -1) rightConnect[prefixContig][0] = suffixContig rightConnect[prefixContig][1] = lengthOfOverlap dataSet.sort(reverse=True, key=itemgetter(2)) for key, items in groupby(dataSet, itemgetter(2)): maxVal = -1 myName = key connectorName = "" for eachsubitem in items: if eachsubitem[0] > maxVal: maxVal = eachsubitem[0] connectorName = eachsubitem[1] prefix = connectorName.split('_') suffix = myName.split('_') lengthOfOverlap = maxVal if prefix[1] == 'p': prefixContig = int(prefix[0][6:]) * 2 else: prefixContig = int(prefix[0][6:]) * 2 + 1 if suffix[1] == 'p': suffixContig = int(suffix[0][6:]) * 2 else: suffixContig = int(suffix[0][6:]) * 2 + 1 assert(leftConnect[suffixContig][0] == -1) leftConnect[suffixContig][0] = prefixContig leftConnect[suffixContig][1] = lengthOfOverlap # ## Write to file: f = open(folderName + 'rightConnect.txt', 'w') for eachitem, dummyIndex in zip(rightConnect, range(len(rightConnect))): f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n') 
f.close() f = open(folderName + 'leftConnect.txt', 'w') for eachitem, dummyIndex in zip(leftConnect, range(len(leftConnect))): f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n') f.close()