def removeEdgeInBatch(G, noGoPrev, noGoNext):
    """Clear one side of the edge set for two batches of contig nodes.

    Every name in ``noGoPrev`` is converted to a node id with
    ``abunHouseKeeper.parseEdgeNameToID(name, 'C')`` and handed to
    ``G.clearIn``; every name in ``noGoNext`` is converted the same way and
    handed to ``G.clearOut``.

    Args:
        G: graph object exposing ``clearIn(nodeId)`` and ``clearOut(nodeId)``.
        noGoPrev: iterable of contig names passed to ``clearIn``.
        noGoNext: iterable of contig names passed to ``clearOut``.

    Returns:
        The same ``G`` object that was passed in (modified in place).
    """
    batches = ((noGoPrev, "clearIn"), (noGoNext, "clearOut"))
    for contigNames, methodName in batches:
        for contigName in contigNames:
            # Look the method up per call, exactly as a direct G.clearIn(...)
            # / G.clearOut(...) call would.
            clearSide = getattr(G, methodName)
            clearSide(abunHouseKeeper.parseEdgeNameToID(contigName, 'C'))
    return G
def formExtraEdges( folderName="/home/kakitfive/kkdata2/MetaFinisherSC/dataFolderBackup/", optTypeFileHeader="phaseString", contigFilename="improved3", G=[], N1=0): dataList = alignerRobot.extractMumData(folderName, optTypeFileHeader + "CR" + "Out") dataList.sort(key=itemgetter(-2)) lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") count = 0 tmpItem = [] embedContig2ReadDic, read2EmbedContigDic = {}, {} for key, items in groupby(dataList, itemgetter(-2)): isEmbedded = False for eachitem in items: #print eachitem if eachitem[4] > lenDic[key] - 300: isEmbedded = True tmpItem = eachitem if isEmbedded: count = count + 1 readName = tmpItem[-1] embedContig2ReadDic[key] = readName read2EmbedContigDic[readName] = key print "len(embedContig2ReadDic)", len(embedContig2ReadDic) #assert(False) for contigName in embedContig2ReadDic: readName = embedContig2ReadDic[contigName] readIndex, contigIndex = abunHouseKeeper.parseEdgeNameToID( readName, 'R'), abunHouseKeeper.parseEdgeNameToID(contigName, 'C') for eachprev in G.graphNodesList[readIndex].listOfPrevNodes: idNode, wt = eachprev[0], eachprev[1] if idNode < N1: G.insertEdge(idNode, contigIndex, wt) for eachnext in G.graphNodesList[readIndex].listOfNextNodes: idNode, wt = eachnext[0], eachnext[1] if idNode < N1: G.insertEdge(contigIndex, idNode, wt) return G
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList), len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList) , len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def addDataToList(dataList, G, startIndex1, startIndex2, type1, type2):
    """Insert one weighted edge into ``G`` for every alignment record.

    For each record the weight is ``min(record[4], record[5])``.  The two
    endpoints are

    * the id of ``record[-2]`` (parsed with ``type1``) plus ``startIndex1``
    * the id of ``record[-1]`` (parsed with ``type2``) plus ``startIndex2``

    When ``record[0]`` is below 50 the edge is inserted as
    ``insertEdge(type2 node, type1 node, wt)``; otherwise as
    ``insertEdge(type1 node, type2 node, wt)``.

    Args:
        dataList: iterable of alignment records (indexable sequences).
        G: graph exposing ``insertEdge(i, j, wt)``.
        startIndex1: offset added to ids parsed with ``type1``.
        startIndex2: offset added to ids parsed with ``type2``.
        type1: type tag handed to ``parseEdgeNameToID`` for ``record[-2]``.
        type2: type tag handed to ``parseEdgeNameToID`` for ``record[-1]``.

    Returns:
        None; ``G`` is modified in place.
    """
    threshold = 50

    for record in dataList:
        weight = min(record[4], record[5])

        # Tuple elements are evaluated left to right, so the ids are parsed
        # in the same order in each branch as before.
        if record[0] < threshold:
            toNode, fromNode = (
                abunHouseKeeper.parseEdgeNameToID(record[-2], type1) + startIndex1,
                abunHouseKeeper.parseEdgeNameToID(record[-1], type2) + startIndex2,
            )
        else:
            toNode, fromNode = (
                abunHouseKeeper.parseEdgeNameToID(record[-1], type2) + startIndex2,
                abunHouseKeeper.parseEdgeNameToID(record[-2], type1) + startIndex1,
            )

        G.insertEdge(fromNode, toNode, weight)
def addDataToList(dataList, G, startIndex1, startIndex2, type1, type2):
    """Turn alignment records into weighted graph edges.

    Each record contributes exactly one ``G.insertEdge(i, j, wt)`` call with
    ``wt = min(record[4], record[5])``.  The name in ``record[-2]`` is parsed
    with ``type1`` and shifted by ``startIndex1``; the name in ``record[-1]``
    is parsed with ``type2`` and shifted by ``startIndex2``.  ``record[0]``
    decides the orientation: below 50 the ``type2`` node is ``i`` and the
    ``type1`` node is ``j``; otherwise the roles are swapped.

    Args:
        dataList: iterable of indexable alignment records.
        G: graph object providing ``insertEdge``.
        startIndex1: id offset for the ``type1`` side.
        startIndex2: id offset for the ``type2`` side.
        type1: type tag for parsing ``record[-2]``.
        type2: type tag for parsing ``record[-1]``.

    Returns:
        None; edges are added to ``G`` in place.
    """
    threshold = 50

    for row in dataList:
        wt = min(row[4], row[5])
        beginsNearStart = row[0] < threshold

        if beginsNearStart:
            j = startIndex1 + abunHouseKeeper.parseEdgeNameToID(row[-2], type1)
            i = startIndex2 + abunHouseKeeper.parseEdgeNameToID(row[-1], type2)
        else:
            j = startIndex2 + abunHouseKeeper.parseEdgeNameToID(row[-1], type2)
            i = startIndex1 + abunHouseKeeper.parseEdgeNameToID(row[-2], type1)

        G.insertEdge(i, j, wt)
def resolvingTandem( folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec ): print "resolvingTandem" """ Input : repeat info Output : count, join. Algorithm: 1. Find loops 2. Form repeat 3. Form chain of repeat copies back to back 4. Align reads 5. Calculate extra bases beyond flanking region 6. Calculate count 7. Join the contigs """ # 0 ) Load all the data thres = 5 G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") N1 = len(lenDicCC) maxDuplicate = 10 repeatTempFilename = "tandemRepeatTemplate.fasta" mummerFile = "myTandemRepeatTemplate" myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta") header = optTypeFileHeader + "RR" dataListRR = alignerRobot.extractMumData(folderName, header + "Out") dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR) dataListRRDic = {} for eachitem in dataListRR: if eachitem[2] < thres: dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4] header = optTypeFileHeader + "CR" lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = alignerRobot.extractMumData(folderName, header + "Out") dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR) dataListCRDic = {} for eachitem in dataListCR: if eachitem[2] < thres: dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4] print dataListCRDic json_data = open(folderName + repeatSpec, "r") loadData = json.load(json_data) contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta") readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") happyTandemList = {} for eachrepProfile in loadData: # 1) startContig = eachrepProfile[-1][0][0] isTerminate, returnPathList = 
DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False) # 2) if isTerminate: v = returnPathList[-1] i = 0 tandemPath = [] while i < len(returnPathList): if returnPathList[i] == v: tandemPath = returnPathList[i:] i = len(returnPathList) i = i + 1 print returnPathList print tandemPath # 3) [fix it when have time later ; to just use graph; bug at the min thing] repeatContent = "" for kk in range(len(tandemPath[0:-1])): eachitem = tandemPath[kk] - N1 nextitem = tandemPath[kk + 1] - N1 readName = "Read" + str(eachitem / 2) + "_" nextReadName = "Read" + str(nextitem / 2) + "_" if eachitem % 2 == 0: readName = readName + "p" elif eachitem % 2 == 1: readName = readName + "d" if nextitem % 2 == 0: nextReadName = nextReadName + "p" elif nextitem % 2 == 1: nextReadName = nextReadName + "d" overlap = dataListRRDic[readName + ";" + nextReadName] print overlap repeatContent = repeatContent + myContigsDic[readName][0:-overlap] print "len(repeatContent)", len(repeatContent) fout = open(folderName + repeatTempFilename, "w") fout.write(">RepeatSegment\n") repeatContentLarge = "" for i in range(maxDuplicate): fout.write(repeatContent) repeatContentLarge = repeatContentLarge + repeatContent fout.close() # 4) repeatReadList = eachrepProfile[1] myList = [] for eachitem in repeatReadList: readName = "Read" + str((eachitem - N1) / 2) + "_" if eachitem % 2 == 0: readName = readName + "p" elif eachitem % 2 == 1: readName = readName + "d" myList.append(readName) IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList) if True: alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta") dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out") # 5) totalBasesMatch = 0 lrepeat = len(repeatContent) c = findCoverageFromRawData(folderName) # print "dataList[0]", dataList[0] dataList.sort(key=itemgetter(-1)) for key, values in groupby(dataList, itemgetter(-1)): maxValue = -1 for eachsub 
in values: if eachsub[5] > maxValue: maxValue = eachsub[5] # print key, maxValue totalBasesMatch = totalBasesMatch + maxValue print c, lrepeat, totalBasesMatch ct = totalBasesMatch * 1.0 / (c * lrepeat) print "BIG NUMBER of THE DAY: ", ct # 6) # a) find the starting point startContig = eachrepProfile[-1][0][0] firstRead = eachrepProfile[-1][0][1] - N1 contigName = "Contig" + str(startContig / 2) if startContig % 2 == 0: contigName = contigName + "_p" elif startContig % 2 == 1: contigName = contigName + "_d" readName = "Read" + str(firstRead / 2) if firstRead % 2 == 0: readName = readName + "_p" elif firstRead % 2 == 1: readName = readName + "_d" overlapFirst = dataListCRDic[contigName + ";" + readName] tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName] f1 = open(folderName + "firstOverlap.fasta", "w") f1.write(">combined\n") f1.write(tmpCombine) f1.close() if True: alignerRobot.useMummerAlign( mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta" ) dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out") dataList.sort(key=itemgetter(0)) maxVal = -1 maxItm = [] for eachi in dataList: if eachi[5] > maxVal: maxVal = eachi[5] maxItm = eachi print maxItm if len(maxItm) > 0: repeatStart = maxItm[0] contigEnd = maxItm[2] else: repeatStart = 0 contigEnd = -1 # b) format return : prepare the repeat template print "ct*lrepeat", int(repeatStart + ct * lrepeat) print "repeatStart", repeatStart happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)] contigsTmp[contigName] = tmpCombine[0:contigEnd] print "len(contigsTmp[contigName])", len(contigsTmp[contigName]) print "len(happyTandemList[contigName])", len(happyTandemList[contigName]) # 7) Combine all the repeat information and do the join leaderList = [i for i in range(len(contigsTmp))] for eachrepProfile in loadData: startContig = eachrepProfile[-1][0][0] endContig = eachrepProfile[-1][-1][-1] leaderContig = 
leaderList[startContig] leaderName = parseIDToName(leaderContig) endName = parseIDToName(endContig) startName = parseIDToName(startContig) contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName] if endContig != leaderContig: contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName] contigsTmp[endName] = "" leaderList[endContig] = leaderContig leaderAgg = [[] for i in range(len(leaderList))] for i in range(len(leaderList)): leaderAgg[leaderList[i]].append(i) checkingList = [False for i in range(N1)] fout = open(folderName + "tademResolved.fasta", "w") counter = 0 for eachcontig in contigsTmp: id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C") if checkingList[id / 2] == False: fout.write(">Segkk" + str(counter) + "\n") fout.write(contigsTmp[eachcontig]) counter = counter + 1 for eachkk in leaderAgg[leaderList[id]]: checkingList[eachkk / 2] = True fout.close()
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec): print "resolvingTandem" ''' Input : repeat info Output : count, join. Algorithm: 1. Find loops 2. Form repeat 3. Form chain of repeat copies back to back 4. Align reads 5. Calculate extra bases beyond flanking region 6. Calculate count 7. Join the contigs ''' # 0 ) Load all the data thres = 5 G = graphLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta") N1 = len(lenDicCC) maxDuplicate = 10 repeatTempFilename = "tandemRepeatTemplate.fasta" mummerFile = "myTandemRepeatTemplate" myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta") lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta") header = optTypeFileHeader + "RR" dataListRR = alignerRobot.extractMumData(folderName, header + "Out") dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR) dataListRRDic = {} for eachitem in dataListRR: if eachitem[2] < thres: dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4] header = optTypeFileHeader + "CR" lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta") lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = alignerRobot.extractMumData(folderName, header + "Out") dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR) dataListCRDic = {} for eachitem in dataListCR: if eachitem[2] < thres: dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4] print dataListCRDic json_data = open(folderName + repeatSpec, 'r') loadData = json.load(json_data) contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta") readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") happyTandemList = {} for eachrepProfile in loadData: # 1) startContig = eachrepProfile[-1][0][0] isTerminate, returnPathList = DFSwithPath(G, 
G.graphNodesList[startContig], [startContig], N1, False) # 2) if isTerminate: v = returnPathList[-1] i =0 tandemPath = [] while i < len(returnPathList): if returnPathList[i] == v: tandemPath = returnPathList[i:] i = len(returnPathList) i = i +1 print returnPathList print tandemPath # 3) [fix it when have time later ; to just use graph; bug at the min thing] repeatContent = "" for kk in range(len(tandemPath[0:-1])): eachitem = tandemPath[kk]- N1 nextitem = tandemPath[kk+1] - N1 readName = "Read" + str(eachitem/2) + "_" nextReadName = "Read" + str(nextitem/2) + "_" if eachitem %2 ==0 : readName = readName + "p" elif eachitem %2 ==1: readName = readName + "d" if nextitem %2 ==0 : nextReadName = nextReadName + "p" elif nextitem %2 ==1: nextReadName = nextReadName + "d" overlap = dataListRRDic[readName + ";" + nextReadName] print overlap repeatContent = repeatContent + myContigsDic[readName][0:-overlap] print "len(repeatContent)", len(repeatContent) fout = open(folderName + repeatTempFilename, 'w') fout.write(">RepeatSegment\n") repeatContentLarge = "" for i in range(maxDuplicate): fout.write(repeatContent) repeatContentLarge= repeatContentLarge + repeatContent fout.close() # 4) repeatReadList = eachrepProfile[1] myList= [] for eachitem in repeatReadList: readName = "Read" + str((eachitem- N1)/2) + "_" if eachitem %2 ==0 : readName = readName + "p" elif eachitem %2 ==1: readName = readName + "d" myList.append(readName) IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList) if True: alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta") dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out") # 5) totalBasesMatch = 0 lrepeat = len(repeatContent) c = findCoverageFromRawData(folderName) # print "dataList[0]", dataList[0] dataList.sort(key = itemgetter(-1)) for key, values in groupby(dataList,itemgetter(-1)): maxValue = -1 for eachsub in values: if eachsub[5] > maxValue: 
maxValue = eachsub[5] #print key, maxValue totalBasesMatch = totalBasesMatch + maxValue print c, lrepeat, totalBasesMatch ct = totalBasesMatch*1.0/(c*lrepeat) print "BIG NUMBER of THE DAY: ", ct # 6) # a) find the starting point startContig = eachrepProfile[-1][0][0] firstRead = eachrepProfile[-1][0][1]-N1 contigName = "Contig"+ str(startContig/2) if startContig %2 == 0: contigName = contigName + "_p" elif startContig%2 ==1: contigName = contigName + "_d" readName = "Read"+ str(firstRead/2) if firstRead %2 == 0: readName = readName + "_p" elif firstRead%2 ==1: readName = readName + "_d" overlapFirst = dataListCRDic[contigName+";"+readName] tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName] f1 = open(folderName + "firstOverlap.fasta", 'w') f1.write(">combined\n") f1.write(tmpCombine) f1.close() if True: alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta") dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out") dataList.sort(key = itemgetter(0)) maxVal = -1 maxItm = [] for eachi in dataList: if eachi[5] > maxVal: maxVal = eachi[5] maxItm = eachi print maxItm if len(maxItm) > 0 : repeatStart = maxItm[0] contigEnd = maxItm[2] else: repeatStart = 0 contigEnd = -1 # b) format return : prepare the repeat template print "ct*lrepeat", int(repeatStart + ct*lrepeat) print "repeatStart", repeatStart happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)] contigsTmp[contigName] = tmpCombine[0:contigEnd] print "len(contigsTmp[contigName])", len(contigsTmp[contigName]) print "len(happyTandemList[contigName])", len(happyTandemList[contigName]) # 7) Combine all the repeat information and do the join leaderList = [i for i in range(len(contigsTmp))] for eachrepProfile in loadData: startContig = eachrepProfile[-1][0][0] endContig = eachrepProfile[-1][-1][-1] leaderContig = leaderList[startContig] leaderName = parseIDToName(leaderContig) endName = 
parseIDToName(endContig) startName = parseIDToName(startContig) contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName] if endContig != leaderContig: contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName] contigsTmp[endName] = "" leaderList[endContig] = leaderContig leaderAgg = [[] for i in range(len(leaderList))] for i in range(len(leaderList)): leaderAgg[leaderList[i]].append(i) checkingList = [False for i in range(N1)] fout = open(folderName + "tademResolved.fasta", 'w') counter = 0 for eachcontig in contigsTmp: id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C') if checkingList[id/2] == False: fout.write(">Segkk"+str(counter)+ "\n") fout.write(contigsTmp[eachcontig]) counter = counter + 1 for eachkk in leaderAgg[leaderList[id]]: checkingList[eachkk/2] = True fout.close()