def performPhasing(folderName, mummerLink): print "performPhasing" ''' 1. Interface from alignmentBridge.py : shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList) cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init") in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote") extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True) 2. Format of input data data : bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList]) 3. IO : a) Input : repeatSpecification.txt, phasingSeedName_Double.fasta, graph G b) Output : improved4.fasta 3. Algorithm: a) reformatNoisyReads b) reformatToProcessList c) formShortToLongMapping ''' json_data = open(folderName + 'repeatSpecification.txt', 'r') loadData = json.load(json_data) G = commonLib.seqGraph(0) G.loadFromFile(folderName, "phaseStringGraph1") lenDicRR = commonLib.obtainLength(folderName, "phasingSeedName_Double.fasta") lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta") N1 = len(lenDicCC) lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) for eachitem in loadData: print eachitem flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1) toProcessList = reformatToProcessList(folderName , flankingList, repeatList, dicFromOriginal, N1) shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal,dicToOriginal, lenDicCR, N1 ) indelRobot = createIndelRobot(folderName) cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init") in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], 
shortToLongMap, toProcessList, indelRobot, "vote") extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True) if extendResult != -1: print "extendResult: ", extendResult assert(1==2)
def filterEdge(adjacencyList, folderName, contigFilename): lenDic = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta") thresFoPhase = 2000 smallList, largeList = [], [] for eachitem in lenDic: id = parseEdgeNameToID(eachitem, 'C') if lenDic[eachitem] < thresFoPhase: smallList.append(id) else: largeList.append(id) newAdjacencyList = [[] for i in range(len(adjacencyList))] for i in largeList: for eachitem in adjacencyList[i]: ######## IMPORTANT: if eachitem in largeList and eachitem / 2 != i / 2: ######## NEED TO REMOVE IN PRODUCTION if True newAdjacencyList[i].append(eachitem) print "len(smallList) , len(largeList): ", len(smallList) , len(largeList) print "lenDic: ", lenDic for eachitem in newAdjacencyList: print "newAdjacencyList :", eachitem return newAdjacencyList
def getAllAssociatedReads(folderName, mummerLink): ''' Input : relatedReads.fasta, raw_reads.fasta Output : all_associated_reads.fasta Algorithm : a) Get all the associated reads b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total i) Align the raws and tmp_seedReads ii) Put the new reads into the SeedReads ''' forFastaName = "phasingSeedName" header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta" command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile os.system(command) N = 1 for trial in range(N): print "trial", trial if False: command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + header + " " + folderName + referenceFile + " " + folderName + queryFile os.system(command) command = mummerLink + "show-coords -r " + folderName + header + ".delta > " + folderName + header + "Out" os.system(command) dataList = commonLib.extractMumData(folderName, header + "Out") filterList = [] lenDicRR = commonLib.obtainLength(folderName, queryFile) print "len(dataList)", len(dataList) for eachitem in dataList: if checkSatisfy(eachitem, lenDicRR): filterList.append(eachitem) filterList.sort(key=itemgetter(-1)) newReads = [] for key, items in groupby(filterList, itemgetter(-1)): newReads.append(key) f = open(folderName + forFastaName + ".txt", 'w') for eachitem in newReads: f.write(eachitem + "\n") f.close() command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta" os.system(command)
def colorNodes(folderName, mummerPath,sourceFilename, contigFilename, readsetFilename): print "colorNodes" lenDic = commonLib.obtainLength(folderName, sourceFilename+".fasta") print lenDic thresForShort = 15000 shortList = [] longList = [] for eachitem in lenDic: if lenDic[eachitem] > thresForShort: longList.append(eachitem) else: shortList.append(eachitem) commonLib.putListToFileO(folderName, sourceFilename+".fasta", contigFilename, longList) commonLib.putListToFileO(folderName, sourceFilename+".fasta", readsetFilename, shortList)
def connectContigs(toPhase, toRemove, toBR, folderName, mummerLink): print "\nConnect Contigs" tmpList = [] delThres =20000 lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta") for eachitem in toRemove: tmpList.append(eachitem/2) tmpList.sort() removeContigIndexList = [] for key, items in groupby(tmpList): name = "Contig"+ str(key)+"_p" if lenDic[name] < delThres: removeContigIndexList.append(2*key) removeContigIndexList.append(2*key+1) print "removeContigIndexList", removeContigIndexList ### toRemove ===> remove both strand when detected G = commonLib.seqGraph(len(lenDic)) ### hack ! make the nodeIndexList to be empty for empty nodes for eachnode in G.graphNodesList: if eachnode.nodeIndex in removeContigIndexList: eachnode.nodeIndexList = [] # form a graph, .condense, then use readContigOut ### add edge for eachedge in toBR: i = eachedge[0]/2 j = eachedge[1]/2 wt = eachedge[3]+1 print "i, j, wt", i, j, wt G.insertEdge(i, j, wt) tmpFileName = "xphasebonus" G.condense() G.saveToFile(folderName,tmpFileName ) commonLib.readContigOut(folderName, mummerLink, tmpFileName, "improved3_Double.fasta", "improved4.fasta", tmpFileName+"Open")
def checkPathLength(path, G, N1, folderName): lenDicRR = commonLib.obtainLength(folderName, "phasingSeedName_Double.fasta") sumLength = 0 overlapLength = 0 for index, i in zip(path, range(len(path))): header = "Read" + str((index - N1) / 2) + "_" if (index - N1) % 2 == 0: header = header + "p" else: header = header + "d" print "lenDicRR[header], ", lenDicRR[header], header print (index - N1) * 2 + 1, (index - N1) * 2 + 2 sumLength = sumLength + lenDicRR[header] if i != len(path) - 1: for eachnext in G.graphNodesList[index].listOfNextNodes: if eachnext[0] == path[i + 1]: overlapLength = overlapLength + eachnext[1] break print sumLength, overlapLength, sumLength - overlapLength
def defineRepeatAndFlanking(folderName, mummerLink,contigFilename,contigReadGraph,repeatFilename,repeatSpec ): ''' Input : V a) String graph : G V b) Repeat Pairing : repeatList Output : V a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) V b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] ) V c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24]) Algorithm : V 1. Find repeat by graph operations V 2. Find flanking region by graph operations V 3. Find associated reads by graph operations ''' print "defineRepeatAndFlanking: " # 0. Load previous data G = commonLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) Grev = formReverseGraph(G) json_data = open(folderName + repeatFilename, 'r') repeatList = json.load(json_data) lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta") N1 = len(lenDicCC) print "repeatList: ", repeatList print "len(G.graphNodesList)", len(G.graphNodesList) bigDumpList = [] print "len(repeatList)", len(repeatList) , repeatList for r in repeatList: rIn, rOut = [], [] for eachitem in r[0]: rIn.append(eachitem / 2) for eachitem in r[1]: rOut.append(eachitem / 2) if ( len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1): print rIn, rOut if (len(rIn) == 1 and len(rOut) == 1): rIn = [rIn[0], rIn[0]] rOut = [rOut[0], rOut[0]] # 1. Records reachable indices kkIn , kkOut = [], [] for eachkk in rIn: kkIn.append(str(eachkk)+"_"+"in") for eachkk in rOut: kkOut.append(str(eachkk)+"_"+"out") markReachableIndices(G, Grev, kkIn, kkOut, N1) # 2. Marks inside nodes singleMissList, allPassList = markInsideNodes(G, kkIn, kkOut) for i in range(4): print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList) # 3. Finds start/end of repeat myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList) print myStartIndex, myEndIndex # 4. 
Find repeat interior by shortest path joining S/E repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1) print "repeatPathway", repeatPathway #checkPathLength(repeatPathway, G, N1, folderName) # 5. Find flanking region by shortest path search again flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1) print flankingPathsList # 6. Find associated reads by graph node query flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList) # ## Experimental repeatList = allPassList # ## End Experimental for eachlist in flankingList: print len(eachlist), len(repeatList) bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList]) # 7. Format return and move on to the phasing with open(folderName + repeatSpec, 'w') as outfile: json.dump(bigDumpList, outfile)
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun ): ''' Input : Graph --- phaseStringGraph1 Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } Algorithm: a) Reachability test on the graph to find the partners b) Form Bipartite graph c) Find connected component in the bipartite and define as repeat pairs ''' # ## (a) reachability test to find partners G = commonLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) # G.reportEdge() lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta") adjacencyList = [[] for i in range(len(lenDicCC))] N1 = len(lenDicCC) # # Debug # for i in range(14): # debugGraphPath(i, 2, G, N1) # # End Debug for i in range(len(lenDicCC)): adjacencyList[i] = findAllReachable(i, N1, G) print "i, adjacencyList[i] : ", i , adjacencyList[i] # ## (b) formation of bipartite graph if optionToRun == "tandem" : newAdjacencyList = adjacencyList elif optionToRun == "xphase": newAdjacencyList = filterEdge(adjacencyList, folderName, contigFilename) G2 = commonLib.seqGraph(N1 * 2) for i in range(N1): for j in newAdjacencyList[i]: G2.insertEdge(2 * i, 2 * j + 1, 1) G2.insertEdge(2 * j + 1, 2 * i, 1) clusters = G2.findConnectedComponents() repeatList = [] for eachitem in clusters: leftList, rightList = [], [] for eachsubitem in eachitem: if eachsubitem % 2 == 0 : leftList.append(eachsubitem) else: rightList.append(eachsubitem) repeatList.append([getDistinct(leftList), getDistinct(rightList)]) with open(folderName + repeatFilename, 'w') as outfile: json.dump(repeatList, outfile) json_data = open(folderName + repeatFilename, 'r') loadData = json.load(json_data) assert(loadData == repeatList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName): ''' Input : all_associated_reads.fasta, improved3.fasta Output : (G) String Graph linking the reads and contigs Algorithm: a) Form double reads and contigs V b) Mummer the data and extract dataList three times V c) Use the subroutine to output a graph V d) Output the graph to a file phasing_String_graph.graph V ''' G = [] commonLib.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig") commonLib.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads") header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta" if True: commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta") dataListCC = commonLib.extractMumData(folderName, header + "Out") dataListCC = filterData(dataListCC, lenDicCC) header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta" if True: commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta") dataListRR = commonLib.extractMumData(folderName, header + "Out") dataListRR = filterData(dataListRR, lenDicRR) header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta" if True: commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = commonLib.extractMumData(folderName, header + "Out") dataListCR = filterData(dataListCR, lenDicCR) numberOfNodes = len(lenDicCR) G = commonLib.seqGraph(numberOfNodes) N1, N2 = len(lenDicCC), len(lenDicRR) print 
"N1, N2, numberOfNodes: ", N1, N2, numberOfNodes ''' e.g. of dataListCC[0], dataListRR[0], dataListCR[0] [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d'] [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p'] [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d'] ''' # print dataListCC[0] # print dataListRR[0] # print dataListCR[0] # for eachitem in dataListCC: # print eachitem addDataToList(dataListCC, G, 0, 0, 'C', 'C') # for eachitem in dataListRR[0:10]: # print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]] addDataToList(dataListRR, G, N1, N1, 'R', 'R') addDataToList(dataListCR, G, 0, N1, 'C', 'R') # G.reportEdge() G.saveToFile(folderName, graphName) checkGraphLength(G, N1, lenDicRR) # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes) print "len(G.graphNodesList)", len(G.graphNodesList)
def defineRepeatAndFlanking(folderName, mummerLink): ''' Input : V a) String graph : G V b) Repeat Pairing : repeatList Output : V a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) V b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] ) V c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24]) Algorithm : V 1. Find repeat by graph operations V 2. Find flanking region by graph operations V 3. Find associated reads by graph operations ''' print "defineRepeatAndFlanking: " # 0. Load previous data G = commonLib.seqGraph(0) G.loadFromFile(folderName, "phaseStringGraph1") Grev = formReverseGraph(G) json_data = open(folderName + 'phaseRepeat.txt', 'r') repeatList = json.load(json_data) lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta") N1 = len(lenDicCC) print "repeatList: ", repeatList print "len(G.graphNodesList)", len(G.graphNodesList) bigDumpList = [] print "len(repeatList)",len(repeatList) , repeatList for r in repeatList: rIn, rOut = [], [] for eachitem in r[0]: rIn.append(eachitem / 2) for eachitem in r[1]: rOut.append(eachitem / 2) if len(rIn) == 2 and len(rOut) == 2: print rIn, rOut # 1. Records reachable indices kkIn , kkOut = [],[] ''' for eachnext in G.graphNodesList[4].listOfNextNodes: print 4, eachnext kkIn.append(eachnext[0]) for eachprev in G.graphNodesList[6].listOfPrevNodes: print 6, eachprev kkOut.append(eachprev[0]) print set(kkIn).intersection(set(kkOut)) print len( G.graphNodesList[0].listOfNextNodes), len( G.graphNodesList[2].listOfNextNodes) print len( G.graphNodesList[1].listOfPrevNodes), len( G.graphNodesList[3].listOfPrevNodes) print len( Grev.graphNodesList[0].listOfPrevNodes), len( Grev.graphNodesList[2].listOfPrevNodes) print len( Grev.graphNodesList[1].listOfNextNodes), len( Grev.graphNodesList[3].listOfNextNodes) ''' markReachableIndices(G, Grev, rIn, rOut, N1) # 2. 
Marks inside nodes singleMissList, allPassList = markInsideNodes(G, rIn, rOut) for i in range(4): print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList) # 3. Finds start/end of repeat myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList) print myStartIndex, myEndIndex # 4. Find repeat interior by shortest path joining S/E repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1) print "repeatPathway", repeatPathway checkPathLength(repeatPathway, G, N1, folderName) # 5. Find flanking region by shortest path search again flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1) print flankingPathsList # 6. Find associated reads by graph node query flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList) ### Experimental repeatList = allPassList ### End Experimental for eachlist in flankingList: print len(eachlist), len(repeatList) bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList]) # 7. Format return and move on to the phasing with open(folderName + 'repeatSpecification.txt', 'w') as outfile: json.dump(bigDumpList, outfile)
def defineRegionOfInterest(folderName , mummerLink):
    '''
    Group contig ends that align to each other, pair "in" ends with "out"
    ends, carve out the repeat between them and hand the result to
    connectContigs.

    Reads the alignment records "phasingOut" and the contigs of
    improved3_Double.fasta from folderName (a prefix concatenated verbatim
    with the file names). Prints progress information throughout and returns
    None; the three result lists (toPhase, toRemove, toBR) are passed on to
    connectContigs together with folderName and mummerLink.

    NOTE(review): the source had lost its indentation; the nesting below was
    reconstructed from syntax and data flow -- confirm against the upstream
    file, in particular the final if/else of the per-repeat loop.
    '''
    # Form inInfo and outInfo for [ name and endUsed ]
    # Define [terminating loc] for inInfo and outInfo
    print "defineRegionOfInterest"
    # commonLib.writeToFile_Double1(folderName, "improved3.fasta", "improved3_Double.fasta", "contig")
    # commonLib.useMummerAlign(mummerLink, folderName, "phasing", "improved3_Double.fasta", "improved3_Double.fasta")
    dataList = commonLib.extractMumData(folderName, "phasing" + "Out")
    lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    print "Record length of contigs"
    for eachitem in lenDic:
        print lenDic[eachitem], eachitem

    print "\nPerform alignment and group associated contigs"
    # Convention : 0_p_L, 0_p_R, 0_d_L, 0_d_R
    # Two cluster elements per sequence: even id starts with terminating
    # location 0, odd id with the sequence length.
    N = len(lenDic) * 2
    clusterList = []
    for i in range(N):
        clusterList.append(clusterElem(i))
        if i % 2 == 0:
            clusterList[i].terminatingLoc = 0
        else:
            clusterList[i].terminatingLoc = lenDic[parseIDToName(i)[0:-2]]

    oppoPairList = []
    for eachitem in dataList:
        terminatingLoc, resultOfCk = checkSameSideRequirement(eachitem, lenDic)
        isOppMatch, pair = checkOppositeSideRequirement(eachitem, lenDic)

        # Opposite-side match: remember the (R end, L end) pair with pair[2], pair[3].
        if isOppMatch:
            index1 = parseContigName(pair[0], 'R')
            index2 = parseContigName(pair[1], 'L')
            oppoPairList.append([index1, index2, pair[2], pair[3]])

        # Same-side match: merge the two ends into one cluster and push each
        # end's terminating location further in (max for 'L', min for 'R').
        if resultOfCk == 'L' or resultOfCk == 'R':
            index1 = parseContigName(eachitem[-2], resultOfCk)
            index2 = parseContigName(eachitem[-1], resultOfCk)
            union(clusterList[index1], clusterList[index2])
            if resultOfCk == 'L':
                if clusterList[index1].terminatingLoc < terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                if clusterList[index2].terminatingLoc < terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]
            elif resultOfCk == 'R':
                if clusterList[index1].terminatingLoc > terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                if clusterList[index2].terminatingLoc > terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]

    # Cluster representatives: elements that are their own find() result.
    headList = []
    for eachitem in clusterList:
        if find(eachitem) == eachitem:
            headList.append(eachitem)

    for eachitem in headList:
        for eachsub in familyList(eachitem):
            print parseIDToName(eachsub.id), eachsub.terminatingLoc,
        print

    nFamily = len(headList)  # not used below

    # Define the match of inInfo vs outInfo [matchList]
    oppoPairList.sort()
    # One vote per distinct (index1, index2) pair, recorded on both cluster heads.
    for key, items in groupby(oppoPairList, itemgetter(0, 1)):
        # print parseIDToName(key[0]), parseIDToName(key[1])
        find(clusterList[key[0]]).voteList.append(key[1])
        find(clusterList[key[1]]).voteList.append(key[0])

    matchList = []
    for eachitem in headList:
        if eachitem.id % 2 == 1:
            successorIndex = eachitem.findSuccessor()
            if successorIndex != -1:
                matchList.append([eachitem.id, successorIndex])

    # Each repeat is [inList, outList]; entries are [cluster id, terminatingLoc].
    repeatList = []
    for eachitem in matchList:
        if eachitem[0] != -1 and eachitem[1] != -1:
            inList = []
            for eachsubitem in familyList(find(clusterList[eachitem[0]])):
                inList.append([eachsubitem.id, eachsubitem.terminatingLoc])
            outList = []
            for eachsubitem in familyList(find(clusterList[eachitem[1]])):
                outList.append([eachsubitem.id, eachsubitem.terminatingLoc])
            repeatList.append([inList, outList])

    # Filter the embedded contigs
    # An id-pair (id / 2) appearing on both the in and the out side is removed
    # from both lists (in place) and recorded in globalRemoveList.
    globalRemoveList = []
    for eachitem in repeatList:
        inList = eachitem[0]
        outList = eachitem[1]
        toRemoveList = []
        for aitem in inList:
            for bitem in outList:
                if aitem[0] / 2 == bitem[0] / 2 :
                    toRemoveList.append([aitem, bitem])
                    globalRemoveList.append(aitem[0] / 2)

        for eachsub in toRemoveList:
            if eachsub[0] in inList:
                inList.remove(eachsub[0])
            if eachsub[1] in outList:
                outList.remove(eachsub[1])

    print "\nRepeats and in/out contigs"
    for i in range(len(repeatList)):
        print "(repeatList[i][0]),(repeatList[i][1]): ", (repeatList[i][0]), (repeatList[i][1])
    print "globalRemoveList: ", globalRemoveList
    print "oppoPairList", oppoPairList

    # Define the repeat
    contigDic = commonLib.loadContigsFromFile(folderName, "improved3_Double.fasta")
    newRepeatList = []
    newBRList = []
    print "\nRepeat interior and defining flanking region"
    for eachrepeat in repeatList:
        # ## Get the initial trial
        inReadList = []
        outReadList = []
        for eachitem in eachrepeat[0]:
            inReadList.append(eachitem[0])
        for eachitem in eachrepeat[1]:
            outReadList.append(eachitem[0])

        # First opposite-side pair that links an in end to an out end of this repeat.
        tmpLink = []
        for eachoppoPair in oppoPairList:
            if eachoppoPair[0] in inReadList and eachoppoPair[1] in outReadList:
                tmpLink = eachoppoPair
                break

        if len(tmpLink) > 0:
            f1Read, f2Read = tmpLink[0], tmpLink[1]
            f1 , a1, f2, a2 = -1, tmpLink[2], -1 , tmpLink[3]
            for eachitem in eachrepeat[0]:
                if eachitem[0] == f1Read:
                    f1 = eachitem[1]
            for eachitem in eachrepeat[1]:
                if eachitem[0] == f2Read:
                    f2 = eachitem[1]

            print "f1Read, f2Read, f1, a1, f2, a2:\t ", f1Read, f2Read, f1, a1, f2, a2
            f1tilde , f2tilde = f1, f2

            # ## Refine it
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'R')
                otherid = parseContigName(myrecord[-1], 'R')
                if myid == f1Read and otherid != myid and otherid in inReadList:
                    # NOTE(review): elsewhere in this function
                    # checkSameSideRequirement is unpacked into two values; if
                    # it always returns a 2-tuple this test is always true -- confirm.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myStart = myrecord[0]
                        if myStart > f1tilde:
                            f1tilde = myStart

            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'L')
                otherid = parseContigName(myrecord[-1], 'L')
                if myid == f2Read and otherid != myid and otherid in outReadList:
                    # NOTE(review): same always-true concern as above.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myEnd = myrecord[1]
                        if myEnd < f2tilde:
                            f2tilde = myEnd

            # ## Output the loc indices and read from the real contig to get the repeat out
            print "f1Read, f2Read, f1tilde, a1, f2tilde, a2: \t", f1Read, f2Read, f1tilde, a1, f2tilde, a2
            f1Read_parsed = parseIDToName(f1Read)[0:-2]
            f2Read_parsed = parseIDToName(f2Read)[0:-2]
            print "f1Read_parsed, f2Read_parsed", f1Read_parsed, f2Read_parsed, lenDic[f1Read_parsed], lenDic[f2Read_parsed]

            if a2 < f2tilde:
                repeatSegment = contigDic[f1Read_parsed][f1tilde:] + contigDic[f2Read_parsed][a2:f2tilde]
            else:
                repeatSegment = contigDic[f1Read_parsed][f1tilde:f2tilde - a2]

            print "len(repeatSegment)", len(repeatSegment)

            # ## Put to repeat, remove from repeat, add toBR
            if a2 >= f2tilde or a1 <= f1tilde:
                newBRList.append([f1Read, f2Read, a1, a2])
                # Remaining in/out entries once the linked pair is taken out;
                # layout here is [ins, outs, segment].
                tmpSeg = [[], [], repeatSegment]
                for eachin in eachrepeat[0]:
                    if eachin[0] != f1Read:
                        tmpSeg[0].append(eachin)
                for eachout in eachrepeat[1]:
                    if eachout[0] != f2Read:
                        tmpSeg[1].append(eachout)

                if len(tmpSeg[0]) == 1 and len(tmpSeg[1]) == 1:
                    # TODO
                    inIndex = tmpSeg[0][0][0]
                    outIndex = tmpSeg[1][0][0]
                    # 'found' is never set to True below, so the
                    # "not found" branch is always the one taken.
                    found = False
                    # if repeat exists , then fill in the blanks
                    # otherwise, fill 0, 0.
                    a1New, a2New = -1, 0
                    for secondrecord in dataList:
                        isOppMatch, pair = checkOppositeSideRequirement(secondrecord, lenDic)
                        if isOppMatch:
                            index1 = parseContigName(pair[0], 'R')
                            index2 = parseContigName(pair[1], 'L')
                            if index1 == inIndex and index2 == outIndex:
                                oppoPairList.append([index1, index2, pair[2], pair[3]])

                    if not found:
                        newBRList.append([inIndex, outIndex, -1, 0])
                    else:
                        newBRList.append([inIndex, outIndex, a1New, a2New])

                elif len(tmpSeg[0]) >= 1 and len(tmpSeg[1]) >= 1:
                    newRepeatList.append(tmpSeg)
            else:
                if len(eachrepeat[0]) == 1 and len(eachrepeat[1]) == 1:
                    newBRList.append([f1Read, f2Read, a1, a2])
                elif len(eachrepeat[0]) >= 1 and len(eachrepeat[1]) >= 1:
                    # Layout here is [segment, ins, outs], unlike tmpSeg above.
                    newRepeatList.append([ repeatSegment, eachrepeat[0], eachrepeat[1]])

    # Format output
    # Rmk: if only 1 copy is left, addToBR; if 0 left, remove that repeat
    toPhase = newRepeatList
    toRemove = globalRemoveList
    toBR = newBRList

    print "Items to be returned to next step:"
    print "toPhase", len(toPhase)
    print "toRemove", len(toRemove)
    print "toBR", len(toBR) , toBR

    connectContigs(toPhase, toRemove, toBR, folderName, mummerLink)
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    '''
    Estimate tandem-repeat content for every profile in repeatSpec and write
    the joined contigs to tademResolved.fasta.

    Parameters :
        folderName        -- prefix concatenated verbatim with every file name.
        mummerPath        -- passed to commonLib.useMummerAlign.
        contigReadGraph   -- name of the graph loaded into G.
        contigFilename    -- base name; "<contigFilename>_Double.fasta" is read.
        readsetFilename   -- base name; "<readsetFilename>_Double.fasta" is read.
        optTypeFileHeader -- prefix of the existing "RR" / "CR" alignment outputs.
        repeatSpec        -- JSON file holding the repeat profiles.

    Side effects : writes tandemRepeatTemplate.fasta, firstOverlap.fasta,
    toAlignReads (via commonLib.putListToFileO) and tademResolved.fasta under
    folderName, and prints progress information. Returns None.

    NOTE(review): the source had lost its indentation; the nesting below was
    reconstructed from syntax and data flow -- confirm against the upstream file.
    '''
    print "resolvingTandem"
    '''
    Input : repeat info
    Output : count, join.
    Algorithm:
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = commonLib.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")

    # Read-read records: "nameA;nameB" -> field 4 of the record, kept only
    # when field 1 exceeds field 3.
    header = optTypeFileHeader + "RR"
    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = newPhasing.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[1] > eachitem[3]:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    # Contig-read records, same keying as above.
    header = optTypeFileHeader + "CR"
    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = newPhasing.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[1] > eachitem[3]:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic

    # NOTE(review): this handle is never closed explicitly.
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)

    contigsTmp = commonLib.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = commonLib.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1)
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2)
        if isTerminate:
            # tandemPath = the path from the first occurrence of its last node
            # to the end.
            v = returnPathList[-1]
            i =0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1

            print returnPathList
            print tandemPath
        # NOTE(review): steps 3-6 are assumed to sit at loop level; on that
        # reading tandemPath is unbound (or stale from the previous profile)
        # whenever isTerminate is false -- confirm the intended nesting.

        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        repeatContent = ""

        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"

            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"

            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            # Append the read minus its trailing overlap with the next read.
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        # Template = maxDuplicate back-to-back copies of the repeat unit.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""

        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()

        # 4)
        repeatReadList = eachrepProfile[1]

        myList= []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem- N1)/2) + "_"
            # Parity is taken from eachitem itself, not from eachitem - N1.
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)

        commonLib.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)

        if True:
            commonLib.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")

        dataList = commonLib.extractMumData(folderName, mummerFile+"Out")

        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = 50 # Important parameters : FIX needed in production

        #lengthDic = commonLib.obtainLength(folderName, readsetFilename+"_Double.fasta")
        print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        # For each distinct last-field value, add the largest field-5 value.
        for key, values in groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]

            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"

        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"

        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            commonLib.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")

        dataList = commonLib.extractMumData(folderName, "myFirstOverlap"+"Out")

        dataList.sort(key = itemgetter(0))
        # Record with the largest field 5; ties keep the earliest after the sort.
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        repeatStart = maxItm[0]
        contigEnd = maxItm[2]

        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join
    # leaderList[k] is the contig id whose sequence now carries contig k.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]

        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    checkingList = [False for i in range(N1)]

    fout = open(folderName + "tademResolved.fasta", 'w')

    counter = 0
    for eachcontig in contigsTmp:
        id = newPhasing.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
            fout.write(">Segkk"+str(counter)+ "\n")
            # NOTE(review): no newline is written after the sequence; unless
            # the stored sequences already end with one, the next header is
            # appended to this line -- confirm.
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True

    fout.close()