def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):
    """Filter out query contigs that are (almost) entirely matched by a reference.

    The query file is aligned against the reference file with MUMmer. A query
    is considered redundant when, in some alignment record, its aligned length
    differs from its full length by fewer than ``tolerance`` bases. The names
    that survive are handed to ``IORobot.putListToFileO``; afterwards
    ``SC_n_tmp.fasta`` is copied over ``SC_n.fasta`` inside ``folderName``.

    Args:
        folderName: Working directory prefix. It is concatenated directly with
            file names, so it presumably ends with a path separator.
        mummerLink: MUMmer location, forwarded to the aligner.
        fileR: Reference FASTA file name.
        fileQ: Query FASTA file name.
        outputFileName: Output name forwarded to ``IORobot.putListToFileO``.
    """
    tolerance = 10

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["redundantRvsQ", fileR, fileQ, ""]],
        houseKeeper.globalParallel,
    )
    alignments = alignerRobot.extractMumData(folderName, "redundantRvsQOut")

    refLengths = IORobot.obtainLength(folderName, fileR)
    queryLengths = IORobot.obtainLength(folderName, fileQ)

    redundantQueries = []
    for record in alignments:
        # Record layout used here: [4]/[5] aligned lengths, [7]/[8] names.
        _refMatchLen, queryMatchLen = record[4], record[5]
        refName, queryName = record[7], record[8]
        # The reference length is not used in the decision; the lookup is kept
        # so an unknown reference name still raises KeyError as before.
        _refLen, queryLen = refLengths[refName], queryLengths[queryName]
        if abs(queryLen - queryMatchLen) < tolerance:
            redundantQueries.append(queryName)

    nonRedundantList = obtainComplement(queryLengths, redundantQueries)

    # Single-argument print(...) prints the same text as the Python 2 statement.
    print(nonRedundantList)
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    copyCommand = "cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta"
    os.system(copyCommand)
def removeEmbedded(folderName , mummerLink):
    """Discard the contigs flagged by the self-alignment and save the rest.

    All paths are built by concatenating ``folderName`` with a file name.

    1. Every ``|`` character is stripped from ``contigs.fasta`` (``sed`` into
       ``contigs2.fasta``, then copied back).
    2. ``contigs.fasta`` is aligned against itself with MUMmer, unless a file
       named ``selfOut`` already exists in the folder.
    3. ``alignerRobot.extractMumDataAndRemove`` supplies the names to discard;
       the remaining names go to ``IORobot.putListToFileO`` with the output
       name ``"noEmbed"``.

    Args:
        folderName: Working directory prefix (presumably ending with a path
            separator).
        mummerLink: MUMmer location, forwarded to the aligner.
    """
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print("removeEmbedded")
    tolerance = 10

    contigPath = folderName + "contigs.fasta"
    scratchPath = folderName + "contigs2.fasta"
    os.system("sed -e 's/|//g' " + contigPath + " > " + scratchPath)
    os.system("cp " + scratchPath + " " + contigPath)

    alignmentCached = os.path.isfile(folderName + "selfOut")
    if not alignmentCached:
        # Job entry layout: outputName, referenceName, queryName, specialName
        selfJob = ["self", "contigs.fasta", "contigs.fasta", ""]
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [selfJob], houseKeeper.globalParallel
        )

    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    removeList = alignerRobot.extractMumDataAndRemove(folderName, "selfOut", lenDic, tolerance)

    allNames = list(lenDic)
    print(len(allNames))

    discarded = list(removeList)
    keptNames = [name for name in allNames if name not in discarded]
    print(len(keptNames))

    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", keptNames)
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):
    """Keep only the query contigs that no reference alignment fully covers.

    Runs a MUMmer alignment of ``fileQ`` against ``fileR``, collects every
    query name whose aligned length is within ``slack`` bases of its total
    length, and passes the complement of that set (relative to all query
    names) to ``IORobot.putListToFileO``. Finally ``SC_n_tmp.fasta`` is
    copied onto ``SC_n.fasta`` in ``folderName``.

    Args:
        folderName: Working directory prefix, joined to file names by plain
            concatenation (so presumably it ends with a path separator).
        mummerLink: MUMmer location, forwarded to the aligner.
        fileR: Reference FASTA file name.
        fileQ: Query FASTA file name.
        outputFileName: Output name forwarded to ``IORobot.putListToFileO``.
    """
    slack = 10

    job = ["redundantRvsQ", fileR, fileQ, ""]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, [job], houseKeeper.globalParallel)

    records = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    coveredQueries = []
    for rec in records:
        # rec[4]/rec[5] are the aligned lengths, rec[7]/rec[8] the two names.
        _matchR, matchQ, nameR, nameQ = rec[4], rec[5], rec[7], rec[8]
        # Only the query length drives the test; the reference lookup stays so
        # that an unknown reference name keeps raising KeyError.
        _fullR, fullQ = lenDicR[nameR], lenDicQ[nameQ]
        uncovered = abs(fullQ - matchQ)
        if uncovered < slack:
            coveredQueries.append(nameQ)

    survivors = obtainComplement(lenDicQ, coveredQueries)

    # Single-argument print(...) prints the same text as the Python 2 statement.
    print(survivors)
    IORobot.putListToFileO(folderName, fileQ, outputFileName, survivors)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    """Remove embedded contigs and collapse mutually-embedded ones.

    ``<inputFilename>.fasta`` is first cleaned of ``|`` characters, then
    aligned against itself. For every alignment between two different
    contigs:

    * if only one of them is aligned over (nearly) its whole length, that
      contig is scheduled for removal;
    * if both are, the two are merged into one cluster with ``union``.

    Of the names that are not removed, only those that are the
    representative (``find(...).id``) of their cluster are passed to
    ``IORobot.putListToFileO``.

    Args:
        folderName: Working directory prefix (plain string concatenation).
        mummerLink: MUMmer location, forwarded to the aligner.
        inputFilename: FASTA base name, without the ``.fasta`` suffix.
        mummerTmpName: Name of the alignment job; its data is read back from
            ``mummerTmpName + "Out"``.
        outputFileName: Output name forwarded to ``IORobot.putListToFileO``.
    """
    tolerance = 10
    fastaName = inputFilename + ".fasta"
    basePath = folderName + inputFilename

    os.system("sed -e 's/|//g' " + basePath + ".fasta > " + basePath + "2.fasta")
    os.system("cp " + basePath + "2.fasta " + basePath + ".fasta")

    # Job entry layout: outputName, referenceName, queryName, specialName
    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[mummerTmpName, fastaName, fastaName, ""]],
        houseKeeper.globalParallel,
    )

    records = alignerRobot.transformCoor(
        alignerRobot.extractMumData(folderName, mummerTmpName + "Out")
    )
    lenDic = IORobot.obtainLength(folderName, fastaName)

    clusterOf = {}
    for contigName in lenDic:
        clusterOf[contigName] = clusterElem(contigName)

    removeList = []
    for rec in records:
        match1, match2, name1, name2 = rec[4], rec[5], rec[7], rec[8]
        if name1 == name2:
            continue

        gap1 = abs(lenDic[name1] - match1)
        gap2 = abs(lenDic[name2] - match2)
        # A gap exactly equal to the tolerance matches none of the branches.
        if gap1 < tolerance and gap2 > tolerance:
            removeList.append(name1)
        elif gap1 > tolerance and gap2 < tolerance:
            removeList.append(name2)
        elif gap1 < tolerance and gap2 < tolerance:
            # %-formatting reproduces the space-separated Python 2 print output.
            print("%s %s" % ("Both shortembedd", rec))
            union(clusterOf[name1], clusterOf[name2])

    nameList = obtainComplement(lenDic, removeList)
    returnList = [name for name in nameList if find(clusterOf[name]).id == name]

    print("%s %s %s" % ("len(nameList), len(returnList)", len(nameList), len(returnList)))
    IORobot.putListToFileO(folderName, fastaName, outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    """Write the non-embedded, cluster-representative contigs of a FASTA file.

    Processing order:

    1. Strip ``|`` from ``<inputFilename>.fasta`` in place (through a
       ``<inputFilename>2.fasta`` scratch copy).
    2. Self-align the file with MUMmer under the job name ``mummerTmpName``.
    3. Walk the alignment records between distinct contigs. A contig aligned
       to within ``thres`` bases of its full length, while its partner is
       more than ``thres`` short, is marked for removal. When both are within
       ``thres`` the pair is joined with ``union``.
    4. Keep the names not marked for removal whose cluster representative is
       themselves, and pass them to ``IORobot.putListToFileO``.

    Args:
        folderName: Working directory prefix (plain string concatenation).
        mummerLink: MUMmer location, forwarded to the aligner.
        inputFilename: FASTA base name, without the ``.fasta`` suffix.
        mummerTmpName: Alignment job name; results are read from
            ``mummerTmpName + "Out"``.
        outputFileName: Output name forwarded to ``IORobot.putListToFileO``.
    """
    thres = 10
    sourceFasta = inputFilename + ".fasta"

    stripCommand = (
        "sed -e 's/|//g' " + folderName + inputFilename + ".fasta > "
        + folderName + inputFilename + "2.fasta"
    )
    os.system(stripCommand)
    restoreCommand = (
        "cp " + folderName + inputFilename + "2.fasta "
        + folderName + inputFilename + ".fasta"
    )
    os.system(restoreCommand)

    # Job entry layout: outputName, referenceName, queryName, specialName
    selfJob = [mummerTmpName, sourceFasta, sourceFasta, ""]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, [selfJob], houseKeeper.globalParallel)

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, sourceFasta)

    shortEmbedClusterDic = dict((name, clusterElem(name)) for name in lenDic)

    toRemove = []
    for row in dataList:
        alignedA, alignedB = row[4], row[5]
        nameA, nameB = row[7], row[8]
        if nameA != nameB:
            missingA = abs(lenDic[nameA] - alignedA)
            missingB = abs(lenDic[nameB] - alignedB)
            aEmbedded = missingA < thres
            bEmbedded = missingB < thres
            if aEmbedded and missingB > thres:
                toRemove.append(nameA)
            elif missingA > thres and bEmbedded:
                toRemove.append(nameB)
            elif aEmbedded and bEmbedded:
                # %-formatting reproduces the space-separated Python 2 print output.
                print("%s %s" % ("Both shortembedd", row))
                union(shortEmbedClusterDic[nameA], shortEmbedClusterDic[nameB])

    nameList = obtainComplement(lenDic, toRemove)

    returnList = []
    for candidate in nameList:
        representative = find(shortEmbedClusterDic[candidate])
        if representative.id == candidate:
            returnList.append(candidate)

    print("%s %s %s" % ("len(nameList), len(returnList)", len(nameList), len(returnList)))
    IORobot.putListToFileO(folderName, sourceFasta, outputFileName, returnList)
def removeEmbedded(folderName, mummerLink):
    """Remove contigs that are embedded in another contig.

    ``contigs.fasta`` (inside ``folderName``) is cleaned of ``|`` characters,
    aligned against itself with MUMmer, and every contig that is aligned over
    (nearly) its whole length to a partner that is *not* fully aligned is
    dropped. The remaining names, in ``lenDic`` iteration order, are passed
    to ``IORobot.putListToFileO`` with the output name ``"noEmbed"``.

    Progress messages and the contig counts before and after filtering are
    printed to standard output.

    Args:
        folderName: Working directory prefix; concatenated directly with file
            names, so it presumably ends with a path separator.
        mummerLink: MUMmer location, forwarded to the aligner.
    """
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print("removeEmbedded")
    thres = 10

    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta")
    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    print("removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta")
    # Job entry layout: outputName, referenceName, queryName, specialName
    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["self", "contigs.fasta", "contigs.fasta", ""]],
        houseKeeper.globalParallel,
    )

    print("removeEmbedded: Extracting MUMmer data from delta files to selfOut")
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')

    # A set gives O(1) membership tests in the final filter; the names are
    # already used as dictionary keys, so they are hashable.
    embedded = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 == name2:
            continue

        gap1 = abs(lenDic[name1] - match1)
        gap2 = abs(lenDic[name2] - match2)
        # A gap exactly equal to thres matches none of the branches.
        if gap1 < thres and gap2 > thres:
            embedded.add(name1)
        elif gap1 > thres and gap2 < thres:
            embedded.add(name2)
        elif gap1 < thres and gap2 < thres:
            # Mutually embedded pair: reported only, neither contig is removed.
            print("%s %s" % ("Both shortembedd", eachitem))

    print(len(lenDic))
    nameList = [name for name in lenDic if name not in embedded]
    print(len(nameList))

    print("removeEmbedded: Outputting non-contained contigs to noEmbed.fasta")
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink):
    """Rename the input sequences, then remove contigs embedded in another contig.

    Two perl one-liners rewrite the FASTA header lines of ``raw_reads.fasta``
    and ``contigs.fasta`` to ``>Seg<counter>`` and store the results under
    ``houseKeeper.globalReadName`` and ``houseKeeper.globalContigName``. The
    renamed contig file is then aligned against itself with MUMmer. A contig
    aligned over (nearly) its whole length to a partner that is not fully
    aligned is dropped. The surviving names, in ``lenDic`` iteration order,
    are passed to ``IORobot.putListToFileO`` with the output name
    ``"noEmbed"``.

    The contig counts before and after filtering are printed.

    Args:
        folderName: Working directory prefix; concatenated directly with file
            names, so it presumably ends with a path separator.
        mummerLink: MUMmer location, forwarded to the aligner.
    """
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print("removeEmbedded")
    thres = 10

    renamePrefix = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' '''
    os.system(renamePrefix + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName)
    os.system(renamePrefix + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName)

    contigFile = houseKeeper.globalContigName
    # Job entry layout: outputName, referenceName, queryName, specialName
    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["self", contigFile, contigFile, ""]],
        houseKeeper.globalParallel,
    )

    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, contigFile)

    # A set gives O(1) membership tests in the final filter; the names are
    # already used as dictionary keys, so they are hashable.
    embedded = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 == name2:
            continue

        gap1 = abs(lenDic[name1] - match1)
        gap2 = abs(lenDic[name2] - match2)
        # A gap exactly equal to thres matches none of the branches.
        if gap1 < thres and gap2 > thres:
            embedded.add(name1)
        elif gap1 > thres and gap2 < thres:
            embedded.add(name2)
        elif gap1 < thres and gap2 < thres:
            # Mutually embedded pair: reported only, neither contig is removed.
            print("%s %s" % ("Both shortembedd", eachitem))

    print(len(lenDic))
    nameList = [name for name in lenDic if name not in embedded]
    print(len(nameList))

    IORobot.putListToFileO(folderName, contigFile, "noEmbed", nameList)
def removeEmbedded(folderName , mummerLink):
    """Rename reads and contigs to ``Seg<n>`` and drop embedded contigs.

    Steps (all paths are ``folderName`` + file name):

    1. Run a perl substitution over ``raw_reads.fasta`` and ``contigs.fasta``
       that replaces each header line by ``>Seg<counter>``; outputs go to
       ``houseKeeper.globalReadName`` and ``houseKeeper.globalContigName``.
    2. Self-align the renamed contig file with MUMmer (job name ``"self"``,
       data read back from ``"selfOut"``).
    3. For each alignment between two different contigs, mark the one that is
       aligned to within ``thres`` bases of its full length when its partner
       is more than ``thres`` bases short of full length.
    4. Pass the unmarked names to ``IORobot.putListToFileO`` under the output
       name ``"noEmbed"``.

    The contig counts before and after filtering are printed.

    Args:
        folderName: Working directory prefix (presumably ending with a path
            separator).
        mummerLink: MUMmer location, forwarded to the aligner.
    """
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print("removeEmbedded")
    thres = 10

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)
    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    # Job entry layout: outputName, referenceName, queryName, specialName
    selfJob = ["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, [selfJob], houseKeeper.globalParallel)

    dataList = alignerRobot.transformCoor(alignerRobot.extractMumData(folderName, "selfOut"))
    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)

    # Collected in a set so the final filter is linear; the names are already
    # dictionary keys and therefore hashable.
    removeSet = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 != name2:
            short1 = abs(lenDic[name1] - match1)
            short2 = abs(lenDic[name2] - match2)
            # A shortfall exactly equal to thres matches none of the branches.
            if short1 < thres and short2 > thres:
                removeSet.add(name1)
            elif short1 > thres and short2 < thres:
                removeSet.add(name2)
            elif short1 < thres and short2 < thres:
                # Mutually embedded pair: reported only, nothing is removed.
                print("%s %s" % ("Both shortembedd", eachitem))

    print(len(lenDic))
    nameList = [name for name in lenDic if name not in removeSet]
    print(len(nameList))

    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink):
    """Drop every contig that is contained in another contig.

    Steps (all paths are ``folderName`` + file name):

    1. Strip ``|`` characters from ``contigs.fasta`` (``sed`` into
       ``contigs2.fasta``, then copied back).
    2. Self-align ``contigs.fasta`` with MUMmer (job name ``"self"``, data
       read back from ``"selfOut"``).
    3. For each alignment between two different contigs, mark the one that is
       aligned to within ``thres`` bases of its full length when its partner
       is more than ``thres`` bases short of full length.
    4. Pass the unmarked names to ``IORobot.putListToFileO`` under the output
       name ``"noEmbed"``.

    Progress messages and the contig counts before and after filtering are
    printed to standard output.

    Args:
        folderName: Working directory prefix (presumably ending with a path
            separator).
        mummerLink: MUMmer location, forwarded to the aligner.
    """
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print("removeEmbedded")
    thres = 10

    contigPath = folderName + "contigs.fasta"
    scratchPath = folderName + "contigs2.fasta"
    os.system("sed -e 's/|//g' " + contigPath + " > " + scratchPath)
    os.system("cp " + scratchPath + " " + contigPath)

    print("removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta")
    # Job entry layout: outputName, referenceName, queryName, specialName
    selfJob = ["self", "contigs.fasta", "contigs.fasta", ""]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, [selfJob], houseKeeper.globalParallel)

    print("removeEmbedded: Extracting MUMmer data from delta files to selfOut")
    dataList = alignerRobot.transformCoor(alignerRobot.extractMumData(folderName, "selfOut"))
    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    # Collected in a set so the final filter is linear; the names are already
    # dictionary keys and therefore hashable.
    removeSet = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 != name2:
            short1 = abs(lenDic[name1] - match1)
            short2 = abs(lenDic[name2] - match2)
            # A shortfall exactly equal to thres matches none of the branches.
            if short1 < thres and short2 > thres:
                removeSet.add(name1)
            elif short1 > thres and short2 < thres:
                removeSet.add(name2)
            elif short1 < thres and short2 < thres:
                # Mutually embedded pair: reported only, nothing is removed.
                print("%s %s" % ("Both shortembedd", eachitem))

    print(len(lenDic))
    nameList = [name for name in lenDic if name not in removeSet]
    print(len(nameList))

    print("removeEmbedded: Outputting non-contained contigs to noEmbed.fasta")
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def fillInMissed(folderName, mummerLink, filerefname, filequeryname, fileoutname):
    """Add reference contigs that the query set does not fully cover.

    The current output file is first moved to ``filequeryname``. The query is
    then aligned to the reference. Every reference contig that appears in the
    alignment data but has no record whose reference-side aligned length is
    within ``tolerance`` bases of the contig length is treated as missing.
    The missing reference contigs followed by all query contigs are written
    back to ``fileoutname`` through ``writeSegOut``.

    Args:
        folderName: Working directory prefix (plain string concatenation).
        mummerLink: MUMmer location, forwarded to the aligner.
        filerefname: Reference contig file name.
        filequeryname: Query contig file name (overwritten by the ``mv``).
        fileoutname: Output file name; also the prefix of the alignment job.
    """
    moveCommand = "mv " + folderName + fileoutname + " " + folderName + filequeryname
    os.system(moveCommand)

    jobName = fileoutname + "fillmiss"
    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[jobName, filerefname, filequeryname, ""]],
        houseKeeper.globalParallel,
    )
    records = alignerRobot.extractMumData(folderName, fileoutname + "fillmissOut")
    refLengths = obtainLength(folderName, filerefname)

    # ## Check if there is any missing parts
    # Record layout, e.g.:
    #   1 765 | 11596 10822 | 765 775 | 84.25 | ref_NC_001133_ scf7180000000702
    # so index -2 is the reference name and index 4 its aligned length.
    byReference = itemgetter(-2)
    records.sort(key=byReference)

    tolerance = 100
    missingRefs = []
    # NOTE(review): a reference with no alignment record at all never shows
    # up as a group key, so it is not added here - confirm this is intended.
    for refName, group in groupby(records, byReference):
        fullLength = refLengths[refName]
        covered = any(abs(int(rec[4]) - fullLength) < tolerance for rec in group)
        if not covered:
            missingRefs.append(refName)

    # ## Fill in any missing items
    referenceDic = loadContigsFromFile(folderName, filerefname)
    queryDic = loadContigsFromFile(folderName, filequeryname)

    ctgList = [referenceDic[name] for name in missingRefs]
    ctgList = ctgList + [queryDic[name] for name in queryDic]

    writeSegOut(ctgList, folderName, fileoutname)

    # %-formatting reproduces the space-separated Python 2 print output.
    print("%s %s %s %s" % (
        "fileoutname: len(extraList)", fileoutname, len(missingRefs), len(ctgList)))
def formRelatedReadsFile(folderName, mummerLink):
    """Collect the reads that align to contig ends into ``relatedReads.fasta``.

    Input files (inside ``folderName``): ``improved.fasta`` and the read file
    named by ``houseKeeper.globalReadName``.
    Output files: ``improvedTrunc.fasta``, ``improved_Double.fasta``,
    ``associatedNames.txt``, ``associatedNames2.txt``, ``relatedReads.fasta``
    and ``relatedReads_Double.fasta``.

    Steps:

    1. For every contig, write its first and last ``thres`` bases and their
       reverse complements to ``improvedTrunc.fasta``.
    2. Split the read file into parts with ``fasta-splitter.pl`` and align
       every part against the truncated contig ends with MUMmer.
    3. Parse the ``fromMum<NN>`` coordinate files and keep a read name
       whenever its alignment starts within ``endThres`` of the segment start
       and ends within ``endThres`` of the segment end.
    4. Write the read names that were collected more than once to
       ``associatedNames.txt``, copy at most 12000 of them to
       ``associatedNames2.txt``, and extract those reads with a perl filter.

    Args:
        folderName: Working directory prefix; concatenated directly with file
            names, so it presumably ends with a path separator.
        mummerLink: MUMmer location, forwarded to the aligner.
    """
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print(">formRelatedReadsFile")

    thres = 400
    endThres = 10

    # ## Extract heads of the contigs
    runningIndex = 0
    tempContig = ""
    with open(folderName + "improved.fasta", 'r') as fContigs, \
            open(folderName + "improvedTrunc.fasta", 'w') as fTrunc:
        for line in fContigs:
            if line.endswith('\n'):
                line = line[:-1]

            # startswith() is safe on an empty line, where indexing [0] would
            # raise IndexError.
            if line.startswith('>'):
                if tempContig:
                    runningIndex = _writeContigEndSegments(fTrunc, runningIndex, tempContig, thres)
                tempContig = ""
            else:
                tempContig = tempContig + line

        # The last contig is flushed unconditionally, as before.
        runningIndex = _writeContigEndSegments(fTrunc, runningIndex, tempContig, thres)

    # ## Write double stranded reads
    IORobot.writeToFile_Double1(folderName, "improved.fasta", "improved_Double.fasta", "contig")

    # ## Apply MUMMER on them using cleanedReads against them
    numberOfFiles = max(20, houseKeeper.globalParallel)

    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + houseKeeper.globalReadName
    os.system(command)

    # Part suffixes are zero-padded to two digits: "01", "02", ..., "10", ...
    partSuffixes = ["%02d" % partNumber for partNumber in range(1, numberOfFiles + 1)]

    # globalReadName[0:-6] drops the last six characters of the read file
    # name (presumably the ".fasta" extension).
    readStem = houseKeeper.globalReadName[0:-6]
    # Job entry layout: outputName, referenceName, queryName, specialName
    workerList = [
        ["outGapFillRaw" + suffix,
         "improvedTrunc.fasta",
         readStem + ".part-" + suffix + ".fasta",
         "fromMum" + suffix]
        for suffix in partSuffixes
    ]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, True)

    assoiatedReadIndex = []
    for suffix in partSuffixes:
        with open(folderName + "fromMum" + suffix, 'r') as fCoords:
            # Skip five header lines; processing starts at the sixth line.
            for _ in range(5):
                fCoords.readline()

            for tmp in iter(fCoords.readline, ""):
                infoArr = tmp.split('|')
                rdGpArr = infoArr[-1].split('\t')

                pos = [int(token) for token in infoArr[0].split(" ") if len(token) > 0]
                startPos, endPos = pos[0], pos[1]

                if startPos < endThres and endPos > thres - endThres:
                    # Stored unstripped (it may still carry the line ending);
                    # it is stripped when written out below.
                    assoiatedReadIndex.append(rdGpArr[1])

    assoiatedReadIndex.sort()

    ckIndex = 0
    oneItem = 0
    keyFound = set()
    with open(folderName + "associatedNames.txt", 'w') as fNames:
        for key, items in groupby(assoiatedReadIndex):
            countItem = sum(1 for _ in items)

            if countItem == 1:
                oneItem += 1
            else:
                key = key.rstrip()
                if key not in keyFound:
                    fNames.write(key + '\n')
                    keyFound.add(key)

            ckIndex += 1

    # %-formatting reproduces the space-separated Python 2 print output.
    print("%s %s %s" % ("ckIndex,oneItem: ", ckIndex, oneItem))

    # Keep only the first maxCount names.
    maxCount = 12000
    with open(folderName + "associatedNames.txt", 'r') as fFilter, \
            open(folderName + "associatedNames2.txt", 'w') as fout:
        for lineNumber, nameLine in enumerate(fFilter):
            if lineNumber >= maxCount:
                break
            fout.write(nameLine)

    # Raw string: the runtime text is unchanged, but "\S" is no longer an
    # invalid string escape.
    command = r"perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "associatedNames2.txt " + folderName + houseKeeper.globalReadName + " > " + folderName + "relatedReads.fasta"
    os.system(command)

    IORobot.writeToFile_Double1(folderName, "relatedReads.fasta", "relatedReads_Double.fasta", "read")


def _writeContigEndSegments(outFile, nextIndex, contig, segLen):
    """Write head, tail and their reverse complements of *contig*; return the next free index."""
    head, tail = contig[0:segLen], contig[-segLen:]
    for piece, reverse in ((head, False), (tail, False), (head, True), (tail, True)):
        segment = houseKeeper.reverseComplement(piece) if reverse else piece
        IORobot.writeToFile(outFile, nextIndex, segment)
        nextIndex = nextIndex + 1
    return nextIndex
def extractEdgeSet(folderName, mummerLink, option="nopolish"):
    """Build the read-supported links between contig ends and write the result.

    Input  : ``relatedReads_Double.fasta``, ``improved_Double.fasta`` and
             ``improved.fasta`` inside ``folderName``.
    Output : produced by ``writeContigReadCombine`` from the best link found
             for every contig pair.

    Steps:

    1. Truncate the double-stranded contigs and align the related reads
       (split into parts by ``fasta-splitter.pl``) against them with MUMmer.
    2. Gather the read/contig link records of every part and filter them with
       the usable-junction list (``loadOpenList`` / ``filterRepeatEnd``).
    3. Form match pairs, filter repeat pairs, and for every pair of the first
       two fields keep the entry with the largest third field.
    4. Pass the best pairs to ``formbestpair`` and write the combined result.

    Args:
        folderName: Working directory prefix (plain string concatenation).
        mummerLink: MUMmer location, forwarded to the aligner.
        option: Forwarded unchanged to ``writeContigReadCombine``.
    """
    # ## Perform MUMMER alignment
    # Single-argument print(...) prints the same text as the Python 2 statement.
    print(">Extract Edge set")

    contigOnlyLengthDic = IORobot.obtainLength(folderName, "improved.fasta")
    lengthDic = IORobot.findContigLength(folderName, "improved")
    numberOfContig = 2 * len(contigOnlyLengthDic)

    K = 400
    thres = 5

    # ## Apply MUMMER on them using cleanedReads against them
    IORobot.truncateEndOfContigs(
        folderName, "improved_Double.fasta", "smaller_improvedContig.fasta", 25000, lengthDic
    )

    numberOfFiles = max(20, houseKeeper.globalParallel)
    partNumbers = range(1, numberOfFiles + 1)

    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    splitCommand = (
        bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " "
        + folderName + "relatedReads_Double.fasta"
    )
    os.system(splitCommand)

    # Job entry layout: outputName, referenceName, queryName, specialName.
    # Part suffixes are zero-padded to two digits ("01", "02", ...).
    workerList = []
    for partNumber in partNumbers:
        suffix = "%02d" % partNumber
        workerList.append([
            "outGapFillRefine" + suffix,
            "smaller_improvedContig.fasta",
            "relatedReads_Double.part-" + suffix + ".fasta",
            "fromMumRefine" + suffix,
        ])

    alignerRobot.useMummerAlignBatch(
        mummerLink, folderName, workerList, houseKeeper.globalParallel, True
    )

    dataSet = []
    for partNumber in partNumbers:
        dataSet.extend(
            IORobot.obtainLinkInfoReadContig(partNumber, mummerLink, folderName, thres, lengthDic, K)
        )

    # ## repeat aware
    usableJunction = loadOpenList(folderName)
    dataSet, blockedSet = filterRepeatEnd(dataSet, usableJunction)
    # ## repeat aware end

    dataSet.sort()
    # Repeat pairs are filtered because repeat detection from reads alone
    # was buggy without it.
    matchPair = filterRepeatPair(formMatchPairFromReadInfo(dataSet))

    # For every (first, second) key keep the first entry with the largest
    # score in position 2, together with positions 3-5 of that entry.
    bestMatchPair = []
    for pairKey, candidates in groupby(matchPair, itemgetter(0, 1)):
        topScore = -1
        topDetail = []
        for candidate in candidates:
            score = candidate[2]
            if score > topScore:
                topScore = score
                topDetail = [candidate[3], candidate[4], candidate[5]]

        bestMatchPair.append(
            [pairKey[0], pairKey[1], topScore, topDetail[0], topDetail[1], topDetail[2]]
        )

    contigList, leftConnect, rightConnect, rawReadList = formbestpair(bestMatchPair, numberOfContig)

    # %-formatting reproduces the space-separated Python 2 print output.
    print("%s %s" % ("contigList", contigList))

    writeContigReadCombine(
        blockedSet, dataSet, folderName, rawReadList, numberOfContig,
        contigList, leftConnect, option, rightConnect, mummerLink
    )
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    """Find end-to-end overlaps between contigs of ``<inputFile>.fasta``.

    The contigs are written double-stranded, each sequence line is cut down
    to its first and last ``maxSize`` characters, and the result is aligned
    against itself with MUMmer. An alignment is kept as a link when all of
    these hold:

    * the two names differ, including in the part before the first ``_``;
    * the longer aligned length exceeds ``minLen``;
    * the second sequence is aligned in forward direction;
    * the alignment starts within ``thres`` of a sequence start and ends
      within ``thres`` of a sequence end;
    * it starts within ``thres`` of the start of the first sequence.

    Args:
        folderName: Working directory prefix (plain string concatenation).
        mummerLink: MUMmer location, forwarded to the aligner.
        inputFile: FASTA base name, without the ``.fasta`` suffix.
        mummerFile: Alignment job name; data is read from ``mummerFile + "Out"``.

    Returns:
        ``(numberOfContig, dataSet)`` where ``numberOfContig`` is the number
        of sequences in the truncated double-stranded file and ``dataSet`` is
        a list of ``(matchLength, readName, helperName)`` tuples sorted in
        descending order.
    """
    thres = 5
    minLen = 400
    maxSize = 50000

    doubleName = inputFile + "_Double.fasta"
    truncatedName = inputFile + "_contigs_Double.fasta"

    writeToFile_Double1(folderName, inputFile + ".fasta", doubleName, "contig")

    # ``with`` closes both handles even if the copy loop raises.
    with open(folderName + doubleName, 'r') as fmyFile, \
            open(folderName + truncatedName, 'w') as fSmaller:
        myName = ""
        # As before, reading stops at the first line that is empty after
        # rstrip(), which includes end of file.
        tmp = fmyFile.readline().rstrip()
        while tmp:
            if tmp[0] == '>':
                fSmaller.write(tmp + '\n')
                myName = tmp[1:]
            else:
                # A line shorter than maxSize is written twice (head + tail),
                # exactly as before.
                head = tmp[:maxSize]
                tail = tmp[-maxSize:]
                fSmaller.write(head)
                fSmaller.write(tail)
                # %-formatting reproduces the Python 2 print output.
                print("%s %s %s" % ("DebugName", myName, len(head) + len(tail)))
                fSmaller.write('\n')

            tmp = fmyFile.readline().rstrip()

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[mummerFile, truncatedName, truncatedName, ""]],
        houseKeeper.globalParallel,
    )

    lengthDic = obtainLength(folderName, truncatedName)
    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")

    # ## Format [ helperStart, helperEnd , readStart, readEnd,matchLen1,matchLen2,percentMatch,helperName,readName]
    dataSet = []
    for eachitem in dataSetRaw:
        helperStart, helperEnd, readStart, readEnd, matchLen1, matchLen2, percentMatch, helperName, readName = eachitem

        detailHelper = helperName.split('_')
        detailRead = readName.split('_')
        matchLen = max(matchLen1, matchLen2)

        # The checks run in the original short-circuit order, so the length
        # lookups happen only for records that pass the cheaper tests.
        if detailHelper[0] == detailRead[0] or helperName == readName:
            continue
        if matchLen <= minLen or readStart >= readEnd:
            continue
        if min(helperStart, readStart) >= thres:
            continue
        tailGap = min(lengthDic[helperName] - helperEnd, lengthDic[readName] - readEnd)
        if tailGap + 1 >= thres:
            continue

        if helperStart < thres:
            dataSet.append((matchLen, readName, helperName))

    dataSet.sort(reverse=True)

    numberOfContig = len(lengthDic)

    return numberOfContig, dataSet
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    """Find end-to-end overlaps between contigs of ``<inputFile>.fasta``.

    The contigs are written double-stranded, each sequence line is cut down
    to its first and last ``maxSize`` characters, and the result is aligned
    against itself with MUMmer. An alignment is kept as a link when all of
    these hold:

    * the two names differ, including in the part before the first ``_``;
    * the longer aligned length exceeds ``minLen``;
    * the second sequence is aligned in forward direction;
    * the alignment starts within ``thres`` of a sequence start and ends
      within ``thres`` of a sequence end;
    * it starts within ``thres`` of the start of the first sequence.

    ``thres`` is 10 when ``houseKeeper.globalRelaxThres`` is set and 5
    otherwise.

    Args:
        folderName: Working directory prefix (plain string concatenation).
        mummerLink: MUMmer location, forwarded to the aligner.
        inputFile: FASTA base name, without the ``.fasta`` suffix.
        mummerFile: Alignment job name; data is read from ``mummerFile + "Out"``.

    Returns:
        ``(numberOfContig, dataSet)`` where ``numberOfContig`` is the number
        of sequences in the truncated double-stranded file and ``dataSet`` is
        a list of ``(matchLength, readName, helperName)`` tuples sorted in
        descending order.
    """
    minLen = 400
    maxSize = 50000

    # A single truthiness test always binds thres. The former ``== False`` /
    # ``== True`` pair left it undefined for any other value (e.g. None).
    thres = 10 if houseKeeper.globalRelaxThres else 5

    doubleName = inputFile + "_Double.fasta"
    truncatedName = inputFile + "_contigs_Double.fasta"

    writeToFile_Double1(folderName, inputFile + ".fasta", doubleName, "contig")

    # ``with`` closes both handles even if the copy loop raises.
    with open(folderName + doubleName, 'r') as fmyFile, \
            open(folderName + truncatedName, 'w') as fSmaller:
        myName = ""
        # As before, reading stops at the first line that is empty after
        # rstrip(), which includes end of file.
        tmp = fmyFile.readline().rstrip()
        while tmp:
            if tmp[0] == '>':
                fSmaller.write(tmp + '\n')
                myName = tmp[1:]
            else:
                # A line shorter than maxSize is written twice (head + tail),
                # exactly as before.
                head = tmp[:maxSize]
                tail = tmp[-maxSize:]
                fSmaller.write(head)
                fSmaller.write(tail)
                # %-formatting reproduces the Python 2 print output.
                print("%s %s %s" % ("DebugName", myName, len(head) + len(tail)))
                fSmaller.write('\n')

            tmp = fmyFile.readline().rstrip()

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[mummerFile, truncatedName, truncatedName, ""]],
        houseKeeper.globalParallel,
    )

    lengthDic = obtainLength(folderName, truncatedName)
    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")

    # ## Format [ helperStart, helperEnd , readStart, readEnd,matchLen1,matchLen2,percentMatch,helperName,readName]
    dataSet = []
    for eachitem in dataSetRaw:
        helperStart, helperEnd, readStart, readEnd, matchLen1, matchLen2, percentMatch, helperName, readName = eachitem

        detailHelper = helperName.split('_')
        detailRead = readName.split('_')
        matchLen = max(matchLen1, matchLen2)

        # The checks run in the original short-circuit order, so the length
        # lookups happen only for records that pass the cheaper tests.
        if detailHelper[0] == detailRead[0] or helperName == readName:
            continue
        if matchLen <= minLen or readStart >= readEnd:
            continue
        if min(helperStart, readStart) >= thres:
            continue
        tailGap = min(lengthDic[helperName] - helperEnd, lengthDic[readName] - readEnd)
        if tailGap + 1 >= thres:
            continue

        if helperStart < thres:
            dataSet.append((matchLen, readName, helperName))

    dataSet.sort(reverse=True)

    numberOfContig = len(lengthDic)

    return numberOfContig, dataSet