def getAllAssociatedReads(folderName, mummerLink): ''' Input : relatedReads.fasta, raw_reads.fasta Output : all_associated_reads.fasta Algorithm : a) Get all the associated reads b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total i) Align the raws and tmp_seedReads ii) Put the new reads into the SeedReads ''' forFastaName = "phasingSeedName" header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta" command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile os.system(command) N = 1 for trial in range(N): print "trial", trial if False: command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + header + " " + folderName + referenceFile + " " + folderName + queryFile os.system(command) command = mummerLink + "show-coords -r " + folderName + header + ".delta > " + folderName + header + "Out" os.system(command) dataList = commonLib.extractMumData(folderName, header + "Out") filterList = [] lenDicRR = commonLib.obtainLength(folderName, queryFile) print "len(dataList)", len(dataList) for eachitem in dataList: if checkSatisfy(eachitem, lenDicRR): filterList.append(eachitem) filterList.sort(key=itemgetter(-1)) newReads = [] for key, items in groupby(filterList, itemgetter(-1)): newReads.append(key) f = open(folderName + forFastaName + ".txt", 'w') for eachitem in newReads: f.write(eachitem + "\n") f.close() command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta" os.system(command)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName): ''' Input : all_associated_reads.fasta, improved3.fasta Output : (G) String Graph linking the reads and contigs Algorithm: a) Form double reads and contigs V b) Mummer the data and extract dataList three times V c) Use the subroutine to output a graph V d) Output the graph to a file phasing_String_graph.graph V ''' G = [] commonLib.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig") commonLib.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads") header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta" if True: commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta") dataListCC = commonLib.extractMumData(folderName, header + "Out") dataListCC = filterData(dataListCC, lenDicCC) header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta" if True: commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta") dataListRR = commonLib.extractMumData(folderName, header + "Out") dataListRR = filterData(dataListRR, lenDicRR) header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta" if True: commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile) lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = commonLib.extractMumData(folderName, header + "Out") dataListCR = filterData(dataListCR, lenDicCR) numberOfNodes = len(lenDicCR) G = commonLib.seqGraph(numberOfNodes) N1, N2 = len(lenDicCC), len(lenDicRR) print 
"N1, N2, numberOfNodes: ", N1, N2, numberOfNodes ''' e.g. of dataListCC[0], dataListRR[0], dataListCR[0] [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d'] [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p'] [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d'] ''' # print dataListCC[0] # print dataListRR[0] # print dataListCR[0] # for eachitem in dataListCC: # print eachitem addDataToList(dataListCC, G, 0, 0, 'C', 'C') # for eachitem in dataListRR[0:10]: # print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]] addDataToList(dataListRR, G, N1, N1, 'R', 'R') addDataToList(dataListCR, G, 0, N1, 'C', 'R') # G.reportEdge() G.saveToFile(folderName, graphName) checkGraphLength(G, N1, lenDicRR) # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes) print "len(G.graphNodesList)", len(G.graphNodesList)
def defineRegionOfInterest(folderName , mummerLink):
    '''
    Group contig ends that align to each other, derive repeat candidates from
    those groups and pass the result to connectContigs.

    Input  : the alignment table "phasingOut" and "improved3_Double.fasta",
             both read from folderName
    Output : nothing is returned; the three lists toPhase, toRemove and toBR
             are handed to connectContigs(toPhase, toRemove, toBR, folderName, mummerLink)

    Steps :
        1) Create one clusterElem per contig end and merge ends with union()
           whenever checkSameSideRequirement reports an 'L' or 'R' match
        2) Collect opposite-side matches (oppoPairList) and use them as votes
           to pair an "in" cluster with its successor "out" cluster
        3) Drop contigs that show up on both sides of the same repeat
        4) For each repeat, cut out the repeat segment and decide whether the
           repeat goes to newRepeatList (to phase) or newBRList

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost -- confirm against the original file.
    '''
    # Form inInfo and outInfo for [ name and endUsed ]
    # Define [terminating loc] for inInfo and outInfo
    print "defineRegionOfInterest"
    # commonLib.writeToFile_Double1(folderName, "improved3.fasta", "improved3_Double.fasta", "contig")
    # commonLib.useMummerAlign(mummerLink, folderName, "phasing", "improved3_Double.fasta", "improved3_Double.fasta")

    dataList = commonLib.extractMumData(folderName, "phasing" + "Out")
    lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta")

    print "Record length of contigs"
    for eachitem in lenDic:
        print lenDic[eachitem], eachitem

    print "\nPerform alignment and group associated contigs"
    # Convention : 0_p_L, 0_p_R, 0_d_L, 0_d_R
    # Two cluster elements per entry of lenDic: even ids start with
    # terminatingLoc 0, odd ids with the length of the sequence they belong to
    # (the name is parseIDToName(i) minus its last two characters).
    N = len(lenDic) * 2
    clusterList = []
    for i in range(N):
        clusterList.append(clusterElem(i))
        if i % 2 == 0:
            clusterList[i].terminatingLoc = 0
        else:
            clusterList[i].terminatingLoc = lenDic[parseIDToName(i)[0:-2]]

    oppoPairList = []
    for eachitem in dataList:
        terminatingLoc, resultOfCk = checkSameSideRequirement(eachitem, lenDic)
        isOppMatch, pair = checkOppositeSideRequirement(eachitem, lenDic)

        # Opposite-side match: remember [R-end id, L-end id, pair[2], pair[3]].
        if isOppMatch:
            index1 = parseContigName(pair[0], 'R')
            index2 = parseContigName(pair[1], 'L')
            oppoPairList.append([index1, index2, pair[2], pair[3]])

        # Same-side match: put both ends in one cluster and tighten their
        # terminating locations.
        if resultOfCk == 'L' or resultOfCk == 'R':
            index1 = parseContigName(eachitem[-2], resultOfCk)
            index2 = parseContigName(eachitem[-1], resultOfCk)
            union(clusterList[index1], clusterList[index2])

            if resultOfCk == 'L':
                # 'L' keeps the largest location seen so far
                if clusterList[index1].terminatingLoc < terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                if clusterList[index2].terminatingLoc < terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]

            elif resultOfCk == 'R':
                # 'R' keeps the smallest location seen so far
                if clusterList[index1].terminatingLoc > terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                if clusterList[index2].terminatingLoc > terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]

    # Cluster heads are the elements that find() maps to themselves.
    headList = []
    for eachitem in clusterList:
        if find(eachitem) == eachitem:
            headList.append(eachitem)

    # One output line per cluster (trailing comma keeps the members on one line).
    for eachitem in headList:
        for eachsub in familyList(eachitem):
            print parseIDToName(eachsub.id), eachsub.terminatingLoc,
        print

    nFamily = len(headList)

    # Define the match of inInfo vs outInfo [matchList]
    # groupby needs equal keys to be adjacent, hence the sort.
    oppoPairList.sort()
    for key, items in groupby(oppoPairList, itemgetter(0, 1)):
        # print parseIDToName(key[0]), parseIDToName(key[1])
        # Each distinct (in, out) pair casts one vote in both directions.
        find(clusterList[key[0]]).voteList.append(key[1])
        find(clusterList[key[1]]).voteList.append(key[0])

    # Only heads with an odd id are asked for a successor; -1 means none.
    matchList = []
    for eachitem in headList:
        if eachitem.id % 2 == 1:
            successorIndex = eachitem.findSuccessor()
            if successorIndex != -1:
                matchList.append([eachitem.id, successorIndex])

    # repeatList entries are [inList, outList]; both hold [id, terminatingLoc].
    repeatList = []
    for eachitem in matchList:
        if eachitem[0] != -1 and eachitem[1] != -1:
            inList = []
            for eachsubitem in familyList(find(clusterList[eachitem[0]])):
                inList.append([eachsubitem.id, eachsubitem.terminatingLoc])

            outList = []
            for eachsubitem in familyList(find(clusterList[eachitem[1]])):
                outList.append([eachsubitem.id, eachsubitem.terminatingLoc])

            repeatList.append([inList, outList])

    # Filter the embedded contigs
    # An in-entry and an out-entry whose ids are equal after halving
    # (Python 2 integer division) are removed from both lists, and the halved
    # id is recorded in globalRemoveList.
    globalRemoveList = []
    for eachitem in repeatList:
        inList = eachitem[0]
        outList = eachitem[1]
        toRemoveList = []
        for aitem in inList:
            for bitem in outList:
                if aitem[0] / 2 == bitem[0] / 2 :
                    toRemoveList.append([aitem, bitem])
                    globalRemoveList.append(aitem[0] / 2)

        # Removal is done after the scan so the lists are not modified while
        # they are being iterated.
        for eachsub in toRemoveList:
            if eachsub[0] in inList:
                inList.remove(eachsub[0])
            if eachsub[1] in outList:
                outList.remove(eachsub[1])

    print "\nRepeats and in/out contigs"
    for i in range(len(repeatList)):
        print "(repeatList[i][0]),(repeatList[i][1]): ", (repeatList[i][0]), (repeatList[i][1])
    print "globalRemoveList: ", globalRemoveList
    print "oppoPairList", oppoPairList

    # Define the repeat
    contigDic = commonLib.loadContigsFromFile(folderName, "improved3_Double.fasta")
    newRepeatList = []
    newBRList = []
    print "\nRepeat interior and defining flanking region"
    for eachrepeat in repeatList:
        # ## Get the initial trial
        inReadList = []
        outReadList = []
        for eachitem in eachrepeat[0]:
            inReadList.append(eachitem[0])
        for eachitem in eachrepeat[1]:
            outReadList.append(eachitem[0])

        # First opposite-side pair that links an in-member to an out-member.
        tmpLink = []
        for eachoppoPair in oppoPairList:
            if eachoppoPair[0] in inReadList and eachoppoPair[1] in outReadList:
                tmpLink = eachoppoPair
                break

        if len(tmpLink) > 0:
            f1Read, f2Read = tmpLink[0], tmpLink[1]
            # f1/f2 are the terminating locations of the linked ends (-1 if not
            # found below); a1/a2 are the two positions stored with the pair.
            f1 , a1, f2, a2 = -1, tmpLink[2], -1 , tmpLink[3]
            for eachitem in eachrepeat[0]:
                if eachitem[0] == f1Read:
                    f1 = eachitem[1]

            for eachitem in eachrepeat[1]:
                if eachitem[0] == f2Read:
                    f2 = eachitem[1]

            print "f1Read, f2Read, f1, a1, f2, a2:\t ", f1Read, f2Read, f1, a1, f2, a2
            f1tilde , f2tilde = f1, f2

            # ## Refine it
            # Push f1tilde up to the largest start among records between f1Read
            # and another member of the in-side.
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'R')
                otherid = parseContigName(myrecord[-1], 'R')
                if myid == f1Read and otherid != myid and otherid in inReadList:
                    # NOTE(review): checkSameSideRequirement is unpacked as a
                    # 2-item result earlier in this function; if it always
                    # returns such a pair this test is always true -- confirm.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myStart = myrecord[0]
                        if myStart > f1tilde:
                            f1tilde = myStart

            # Pull f2tilde down to the smallest end among records between
            # f2Read and another member of the out-side.
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'L')
                otherid = parseContigName(myrecord[-1], 'L')
                if myid == f2Read and otherid != myid and otherid in outReadList:
                    # NOTE(review): same always-true concern as in the loop above.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myEnd = myrecord[1]
                        if myEnd < f2tilde:
                            f2tilde = myEnd

            # ## Output the loc indices and read from the real contig to get the repeat out
            print "f1Read, f2Read, f1tilde, a1, f2tilde, a2: \t", f1Read, f2Read, f1tilde, a1, f2tilde, a2
            # Strip the two-character end suffix to get the key used by
            # lenDic and contigDic.
            f1Read_parsed = parseIDToName(f1Read)[0:-2]
            f2Read_parsed = parseIDToName(f2Read)[0:-2]
            print "f1Read_parsed, f2Read_parsed", f1Read_parsed, f2Read_parsed, lenDic[f1Read_parsed], lenDic[f2Read_parsed]

            if a2 < f2tilde:
                # tail of the first contig plus the [a2, f2tilde) part of the second
                repeatSegment = contigDic[f1Read_parsed][f1tilde:] + contigDic[f2Read_parsed][a2:f2tilde]
            else:
                # only a slice of the first contig is used
                repeatSegment = contigDic[f1Read_parsed][f1tilde:f2tilde - a2]

            print "len(repeatSegment)", len(repeatSegment)

            # ## Put to repeat, remove from repeat, add toBR
            if a2 >= f2tilde or a1 <= f1tilde:
                newBRList.append([f1Read, f2Read, a1, a2])
                # tmpSeg = [remaining in-entries, remaining out-entries, segment]
                tmpSeg = [[], [], repeatSegment]
                for eachin in eachrepeat[0]:
                    if eachin[0] != f1Read:
                        tmpSeg[0].append(eachin)
                for eachout in eachrepeat[1]:
                    if eachout[0] != f2Read:
                        tmpSeg[1].append(eachout)

                if len(tmpSeg[0]) == 1 and len(tmpSeg[1]) == 1:
                    # TODO
                    inIndex = tmpSeg[0][0][0]
                    outIndex = tmpSeg[1][0][0]
                    found = False
                    # if repeat exists , then fill in the blanks
                    # otherwise, fill 0, 0.
                    a1New, a2New = -1, 0
                    for secondrecord in dataList:
                        isOppMatch, pair = checkOppositeSideRequirement(secondrecord, lenDic)
                        if isOppMatch:
                            index1 = parseContigName(pair[0], 'R')
                            index2 = parseContigName(pair[1], 'L')
                            if index1 == inIndex and index2 == outIndex:
                                oppoPairList.append([index1, index2, pair[2], pair[3]])

                    # NOTE(review): found is never set to True and a1New/a2New
                    # are never updated, so only the first branch can run.
                    if not found:
                        newBRList.append([inIndex, outIndex, -1, 0])
                    else:
                        newBRList.append([inIndex, outIndex, a1New, a2New])

                elif len(tmpSeg[0]) >= 1 and len(tmpSeg[1]) >= 1:
                    newRepeatList.append(tmpSeg)

            else:
                if len(eachrepeat[0]) == 1 and len(eachrepeat[1]) == 1:
                    newBRList.append([f1Read, f2Read, a1, a2])
                elif len(eachrepeat[0]) >= 1 and len(eachrepeat[1]) >= 1:
                    # NOTE(review): element order here is [segment, in, out],
                    # whereas tmpSeg above is [in, out, segment] -- confirm
                    # which layout connectContigs expects.
                    newRepeatList.append([ repeatSegment, eachrepeat[0], eachrepeat[1]])

    # Format output
    # Rmk: if only 1 copy is left, addToBR; if 0 left, remove that repeat
    toPhase = newRepeatList
    toRemove = globalRemoveList
    toBR = newBRList

    print "Items to be returned to next step:"
    print "toPhase", len(toPhase)
    print "toRemove", len(toRemove)
    print "toBR", len(toBR) , toBR

    connectContigs(toPhase, toRemove, toBR, folderName, mummerLink)
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec): print "resolvingTandem" ''' Input : repeat info Output : count, join. Algorithm: 1. Find loops 2. Form repeat 3. Form chain of repeat copies back to back 4. Align reads 5. Calculate extra bases beyond flanking region 6. Calculate count 7. Join the contigs ''' # 0 ) Load all the data G = commonLib.seqGraph(0) G.loadFromFile(folderName, contigReadGraph) lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta") N1 = len(lenDicCC) maxDuplicate = 10 repeatTempFilename = "tandemRepeatTemplate.fasta" mummerFile = "myTandemRepeatTemplate" myContigsDic = commonLib.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta") lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta") header = optTypeFileHeader + "RR" dataListRR = commonLib.extractMumData(folderName, header + "Out") dataListRR = newPhasing.filterData(dataListRR, lenDicRR) dataListRRDic = {} for eachitem in dataListRR: if eachitem[1] > eachitem[3]: dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4] header = optTypeFileHeader + "CR" lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta") lenDicCR = dict(lenDicCC.items() + lenDicRR.items()) dataListCR = commonLib.extractMumData(folderName, header + "Out") dataListCR = newPhasing.filterData(dataListCR, lenDicCR) dataListCRDic = {} for eachitem in dataListCR: if eachitem[1] > eachitem[3]: dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4] print dataListCRDic json_data = open(folderName + repeatSpec, 'r') loadData = json.load(json_data) contigsTmp = commonLib.loadContigsFromFile(folderName, contigFilename+"_Double.fasta") readTmp = commonLib.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta") happyTandemList = {} for eachrepProfile in loadData: # 1) startContig = eachrepProfile[-1][0][0] isTerminate, returnPathList = DFSwithPath(G, 
G.graphNodesList[startContig], [startContig], N1, False) # 2) if isTerminate: v = returnPathList[-1] i =0 tandemPath = [] while i < len(returnPathList): if returnPathList[i] == v: tandemPath = returnPathList[i:] i = len(returnPathList) i = i +1 print returnPathList print tandemPath # 3) [fix it when have time later ; to just use graph; bug at the min thing] repeatContent = "" for kk in range(len(tandemPath[0:-1])): eachitem = tandemPath[kk]- N1 nextitem = tandemPath[kk+1] - N1 readName = "Read" + str(eachitem/2) + "_" nextReadName = "Read" + str(nextitem/2) + "_" if eachitem %2 ==0 : readName = readName + "p" elif eachitem %2 ==1: readName = readName + "d" if nextitem %2 ==0 : nextReadName = nextReadName + "p" elif nextitem %2 ==1: nextReadName = nextReadName + "d" overlap = dataListRRDic[readName + ";" + nextReadName] print overlap repeatContent = repeatContent + myContigsDic[readName][0:-overlap] print "len(repeatContent)", len(repeatContent) fout = open(folderName + repeatTempFilename, 'w') fout.write(">RepeatSegment\n") repeatContentLarge = "" for i in range(maxDuplicate): fout.write(repeatContent) repeatContentLarge= repeatContentLarge + repeatContent fout.close() # 4) repeatReadList = eachrepProfile[1] myList= [] for eachitem in repeatReadList: readName = "Read" + str((eachitem- N1)/2) + "_" if eachitem %2 ==0 : readName = readName + "p" elif eachitem %2 ==1: readName = readName + "d" myList.append(readName) commonLib.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList) if True: commonLib.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta") dataList = commonLib.extractMumData(folderName, mummerFile+"Out") # 5) totalBasesMatch = 0 lrepeat = len(repeatContent) c = 50 # Important parameters : FIX needed in production #lengthDic = commonLib.obtainLength(folderName, readsetFilename+"_Double.fasta") print "dataList[0]", dataList[0] dataList.sort(key = itemgetter(-1)) for key, values in 
groupby(dataList,itemgetter(-1)): maxValue = -1 for eachsub in values: if eachsub[5] > maxValue: maxValue = eachsub[5] #print key, maxValue totalBasesMatch = totalBasesMatch + maxValue print c, lrepeat, totalBasesMatch ct = totalBasesMatch*1.0/(c*lrepeat) print "BIG NUMBER of THE DAY: ", ct # 6) # a) find the starting point startContig = eachrepProfile[-1][0][0] firstRead = eachrepProfile[-1][0][1]-N1 contigName = "Contig"+ str(startContig/2) if startContig %2 == 0: contigName = contigName + "_p" elif startContig%2 ==1: contigName = contigName + "_d" readName = "Read"+ str(firstRead/2) if firstRead %2 == 0: readName = readName + "_p" elif firstRead%2 ==1: readName = readName + "_d" overlapFirst = dataListCRDic[contigName+";"+readName] tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName] f1 = open(folderName + "firstOverlap.fasta", 'w') f1.write(">combined\n") f1.write(tmpCombine) f1.close() if True: commonLib.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta") dataList = commonLib.extractMumData(folderName, "myFirstOverlap"+"Out") dataList.sort(key = itemgetter(0)) maxVal = -1 maxItm = [] for eachi in dataList: if eachi[5] > maxVal: maxVal = eachi[5] maxItm = eachi print maxItm repeatStart = maxItm[0] contigEnd = maxItm[2] # b) format return : prepare the repeat template print "ct*lrepeat", int(repeatStart + ct*lrepeat) print "repeatStart", repeatStart happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)] contigsTmp[contigName] = tmpCombine[0:contigEnd] print "len(contigsTmp[contigName])", len(contigsTmp[contigName]) print "len(happyTandemList[contigName])", len(happyTandemList[contigName]) # 7) Combine all the repeat information and do the join leaderList = [i for i in range(len(contigsTmp))] for eachrepProfile in loadData: startContig = eachrepProfile[-1][0][0] endContig = eachrepProfile[-1][-1][-1] leaderContig = leaderList[startContig] leaderName = 
parseIDToName(leaderContig) endName = parseIDToName(endContig) startName = parseIDToName(startContig) contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName] if endContig != leaderContig: contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName] contigsTmp[endName] = "" leaderList[endContig] = leaderContig leaderAgg = [[] for i in range(len(leaderList))] for i in range(len(leaderList)): leaderAgg[leaderList[i]].append(i) checkingList = [False for i in range(N1)] fout = open(folderName + "tademResolved.fasta", 'w') counter = 0 for eachcontig in contigsTmp: id = newPhasing.parseEdgeNameToID(eachcontig, 'C') if checkingList[id/2] == False: fout.write(">Segkk"+str(counter)+ "\n") fout.write(contigsTmp[eachcontig]) counter = counter + 1 for eachkk in leaderAgg[leaderList[id]]: checkingList[eachkk/2] = True fout.close()