def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):
    """Write out the query contigs that are not fully matched by the reference.

    Runs a MUMmer batch alignment of ``fileQ`` against ``fileR`` (output
    prefix ``redundantRvsQ``).  A query contig is marked redundant when some
    alignment record's query match length (field 5) is within ``thres`` bases
    of that contig's length.  The remaining query names are handed to
    ``IORobot.putListToFileO`` together with ``outputFileName``, and finally
    ``SC_n_tmp.fasta`` is copied over ``SC_n.fasta`` inside ``folderName``.

    ``folderName`` is concatenated directly with file names, so it must
    already end with a path separator.
    """
    thres = 10

    jobs = [["redundantRvsQ", fileR, fileQ, ""]]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, jobs, houseKeeper.globalParallel)
    records = alignerRobot.extractMumData(folderName, "redundantRvsQOut")

    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []
    for record in records:
        queryMatchLen, refName, queryName = record[5], record[7], record[8]
        # refLen is not used below; the lookup is kept so that an unknown
        # reference name still fails exactly as before.
        refLen, queryLen = lenDicR[refName], lenDicQ[queryName]
        if abs(queryLen - queryMatchLen) < thres:
            isRedundantList.append(queryName)

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    # Single-argument print(...) gives the same output as the print statement.
    print(nonRedundantList)
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):
    """Keep only the query contigs that no reference alignment covers end to end.

    Steps performed:
      1. MUMmer batch alignment of ``fileQ`` against ``fileR`` under the
         output prefix ``redundantRvsQ``.
      2. Every record whose query match length (field 5) differs from the
         query contig length by less than ``thres`` marks that query contig
         as redundant.
      3. The complement of the redundant names is written through
         ``IORobot.putListToFileO`` using ``outputFileName``.
      4. ``SC_n_tmp.fasta`` is copied onto ``SC_n.fasta`` in ``folderName``.

    ``folderName`` is used as a raw string prefix for every path.
    """
    thres = 10

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["redundantRvsQ", fileR, fileQ, ""]],
        houseKeeper.globalParallel,
    )
    alignments = alignerRobot.extractMumData(folderName, "redundantRvsQOut")

    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []
    for hit in alignments:
        matchOnQuery = hit[5]
        nameR, nameQ = hit[7], hit[8]
        # The reference length itself is unused, but the lookup is retained
        # so a name missing from lenDicR raises just like the original code.
        lengthR, lengthQ = lenDicR[nameR], lenDicQ[nameQ]
        if abs(lengthQ - matchOnQuery) < thres:
            isRedundantList.append(nameQ)

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    # print(...) with one argument prints the same text as "print x".
    print(nonRedundantList)
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    """Drop contigs embedded in other contigs and collapse mutually embedded ones.

    The input ``<inputFilename>.fasta`` first has every ``|`` character
    stripped (via sed into ``<inputFilename>2.fasta``, then copied back).  It
    is then self-aligned with MUMmer under the prefix ``mummerTmpName``.

    For each alignment between two different contigs:
      * if only the first contig is matched to within ``thres`` of its full
        length, the first contig is scheduled for removal;
      * if only the second one is, the second contig is scheduled for removal;
      * if both are, the two contigs are merged into one cluster via
        ``union`` so that a single representative survives.

    Surviving names (complement of the removal list, restricted to cluster
    representatives) are written through ``IORobot.putListToFileO``.
    """
    thres = 10
    fastaPath = folderName + inputFilename + ".fasta"
    strippedPath = folderName + inputFilename + "2.fasta"

    os.system("sed -e 's/|//g' " + fastaPath + " > " + strippedPath)
    os.system("cp " + strippedPath + " " + fastaPath)

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
        houseKeeper.globalParallel,
    )

    records = alignerRobot.transformCoor(
        alignerRobot.extractMumData(folderName, mummerTmpName + "Out")
    )
    lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta")

    # One union-find element per contig, used to group mutually embedded contigs.
    shortEmbedClusterDic = {}
    for contigName in lenDic:
        shortEmbedClusterDic[contigName] = clusterElem(contigName)

    removeList = []
    for record in records:
        match1, match2, name1, name2 = record[4], record[5], record[7], record[8]
        if name1 == name2:
            continue

        gap1 = abs(lenDic[name1] - match1)
        gap2 = abs(lenDic[name2] - match2)

        if gap1 < thres and gap2 > thres:
            removeList.append(name1)
        elif gap1 > thres and gap2 < thres:
            removeList.append(name2)
        elif gap1 < thres and gap2 < thres:
            # Equivalent to: print "Both shortembedd", eachitem
            print("%s %s" % ("Both shortembedd", record))
            union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)
    # Keep only the representative of each cluster.
    returnList = [
        name for name in nameList
        if find(shortEmbedClusterDic[name]).id == name
    ]

    print("%s %s %s" % ("len(nameList), len(returnList)", len(nameList), len(returnList)))
    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    """Filter ``<inputFilename>.fasta`` down to contigs not contained in others.

    Processing order:
      1. Strip every ``|`` from the FASTA file (sed to ``<inputFilename>2.fasta``
         and copy back over the original).
      2. Self-align the file with MUMmer (prefix ``mummerTmpName``) and
         transform the coordinates of the extracted records.
      3. For every record linking two distinct contigs, compare each match
         length with the full contig length using the tolerance ``thres``:
         a contig matched (almost) completely while its partner is not goes
         on the removal list; if both are matched (almost) completely the two
         are joined in a union-find cluster.
      4. Write the names that are neither removed nor a non-representative
         cluster member via ``IORobot.putListToFileO``.
    """
    thres = 10
    baseName = inputFilename + ".fasta"

    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta > " + folderName + inputFilename + "2.fasta")
    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")

    alignJobs = [[mummerTmpName, baseName, baseName, ""]]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, alignJobs, houseKeeper.globalParallel)

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, baseName)

    shortEmbedClusterDic = {}
    for name in lenDic:
        shortEmbedClusterDic[name] = clusterElem(name)

    removeList = []
    for hit in dataList:
        matchA, matchB, nameA, nameB = hit[4], hit[5], hit[7], hit[8]
        if nameA != nameB:
            diffA = abs(lenDic[nameA] - matchA)
            diffB = abs(lenDic[nameB] - matchB)

            if diffA < thres and diffB > thres:
                removeList.append(nameA)
            elif diffA > thres and diffB < thres:
                removeList.append(nameB)
            elif diffA < thres and diffB < thres:
                # Same output as the statement: print "Both shortembedd", eachitem
                print("%s %s" % ("Both shortembedd", hit))
                union(shortEmbedClusterDic[nameA], shortEmbedClusterDic[nameB])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []
    for name in nameList:
        representative = find(shortEmbedClusterDic[name])
        if representative.id == name:
            returnList.append(name)

    print("%s %s %s" % ("len(nameList), len(returnList)", len(nameList), len(returnList)))
    IORobot.putListToFileO(folderName, baseName, outputFileName, returnList)
def alignWithName(leftSeg, rightSeg, folderName, mummerLink, nameOfOut):
    """Return the best suffix/prefix overlap between two sequences via MUMmer.

    At most the last 50000 characters of ``leftSeg`` and the first 50000
    characters of ``rightSeg`` are written to
    ``<folderName><nameOfOut>leftSeg.fasta`` and
    ``<folderName><nameOfOut>rightSeg.fasta``; the two files are aligned with
    ``alignerRobot.useMummerAlign`` under the output name ``nameOfOut``.

    Among the records whose field 1 lies within ``thres`` of the end of the
    written left segment and whose field 2 is below ``thres``, the one with
    the largest field 5 is selected.  (Presumably fields 1/2 are the end on
    the left segment and the start on the right segment, and fields 4/5 the
    two match lengths -- confirm against ``extractMumData``.)

    Returns:
        A two-element list ``[field4, field5]`` of the selected record, or
        ``[0, 0]`` when no record qualifies.
    """
    maxSegLen = 50000
    thres = 10

    # Slicing already copes with sequences shorter than maxSegLen.
    leftTail = leftSeg[-maxSegLen:]
    rightHead = rightSeg[:maxSegLen]
    lLen = len(leftTail)

    # Context managers guarantee the handles are closed even if a write fails.
    with open(folderName + nameOfOut + "leftSeg.fasta", "w") as f:
        f.write(">SegL\n")
        f.write(leftTail)

    with open(folderName + nameOfOut + "rightSeg.fasta", "w") as f:
        f.write(">SegR\n")
        f.write(rightHead)

    alignerRobot.useMummerAlign(
        mummerLink,
        folderName,
        nameOfOut,
        nameOfOut + "leftSeg.fasta",
        nameOfOut + "rightSeg.fasta",
        specialForRaw=False,
        specialName="",
        refinedVersion=True,
    )
    dataList = alignerRobot.extractMumData(folderName, nameOfOut + "Out")

    overlap = [0, 0]
    for eachitem in dataList:
        reachesLeftEnd = eachitem[1] > lLen - thres
        startsAtRightBegin = eachitem[2] < thres
        if reachesLeftEnd and startsAtRightBegin and eachitem[5] > overlap[1]:
            overlap = [eachitem[4], eachitem[5]]

    return overlap
def removeEmbedded(folderName, mummerLink):
    """Write the contigs of ``contigs.fasta`` that are not embedded in another contig.

    ``contigs.fasta`` first has every ``|`` removed (sed into
    ``contigs2.fasta``, copied back), is self-aligned with MUMmer under the
    prefix ``self``, and each record between two different contigs is
    inspected: a contig whose match length is within ``thres`` of its full
    length while the partner's is not is scheduled for removal.  Pairs where
    both are matched (almost) completely are only reported.

    The surviving names are passed to ``IORobot.putListToFileO`` with the
    output name ``noEmbed``.  ``folderName`` is used as a raw path prefix.
    """
    # Single-argument print(...) emits the same text as the print statement.
    print("removeEmbedded")
    thres = 10

    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta")
    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    print("removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta")
    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["self", "contigs.fasta", "contigs.fasta", ""]],
        houseKeeper.globalParallel,
    )

    print("removeEmbedded: Extracting MUMmer data from delta files to selfOut")
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    # A set gives O(1) membership tests when filtering below.
    removeSet = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 == name2:
            continue

        gap1 = abs(lenDic[name1] - match1)
        gap2 = abs(lenDic[name2] - match2)

        if gap1 < thres and gap2 > thres:
            removeSet.add(name1)
        elif gap1 > thres and gap2 < thres:
            removeSet.add(name2)
        elif gap1 < thres and gap2 < thres:
            print("%s %s" % ("Both shortembedd", eachitem))

    nameList = list(lenDic)
    print(len(nameList))

    # Order-preserving filter; replaces the quadratic "in" + list.remove loop.
    nameList = [name for name in nameList if name not in removeSet]
    print(len(nameList))

    print("removeEmbedded: Outputting non-contained contigs to noEmbed.fasta")
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink):
    """Rename sequences, then write the contigs not embedded in another contig.

    Two perl one-liners rewrite ``raw_reads.fasta`` and ``contigs.fasta`` into
    ``houseKeeper.globalReadName`` and ``houseKeeper.globalContigName``; the
    substitution replaces everything from ``>`` to the end of a line with
    ``>Seg`` followed by a running counter.

    The renamed contig file is self-aligned with MUMmer (prefix ``self``).
    For each record between two different contigs, a contig whose match
    length is within ``thres`` of its full length while the partner's is not
    is scheduled for removal; pairs where both qualify are only reported.
    Surviving names go to ``IORobot.putListToFileO`` with output name
    ``noEmbed``.  ``folderName`` is used as a raw path prefix.
    """
    # Single-argument print(...) emits the same text as the print statement.
    print("removeEmbedded")
    thres = 10

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]],
        houseKeeper.globalParallel,
    )

    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)

    # A set gives O(1) membership tests when filtering below.
    removeSet = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 == name2:
            continue

        gap1 = abs(lenDic[name1] - match1)
        gap2 = abs(lenDic[name2] - match2)

        if gap1 < thres and gap2 > thres:
            removeSet.add(name1)
        elif gap1 > thres and gap2 < thres:
            removeSet.add(name2)
        elif gap1 < thres and gap2 < thres:
            print("%s %s" % ("Both shortembedd", eachitem))

    nameList = list(lenDic)
    print(len(nameList))

    # Order-preserving filter; replaces the quadratic "in" + list.remove loop.
    nameList = [name for name in nameList if name not in removeSet]
    print(len(nameList))

    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink):
    """Rename sequences, then keep only contigs that are not contained in others.

    ``raw_reads.fasta`` and ``contigs.fasta`` are rewritten by perl into
    ``houseKeeper.globalReadName`` and ``houseKeeper.globalContigName``; the
    substitution turns each match starting at ``>`` into ``>Seg<counter>``.

    The renamed contig file is aligned against itself with MUMmer (prefix
    ``self``).  A record between two distinct contigs removes the contig that
    is matched to within ``thres`` of its whole length when the other one is
    not; when both are, the pair is only printed.  The remaining names are
    written via ``IORobot.putListToFileO`` under the name ``noEmbed``.
    ``folderName`` is used as a raw path prefix.
    """
    # print(...) with one argument prints the same text as "print x".
    print("removeEmbedded")
    thres = 10
    contigFile = houseKeeper.globalContigName

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["self", contigFile, contigFile, ""]],
        houseKeeper.globalParallel,
    )

    dataList = alignerRobot.transformCoor(alignerRobot.extractMumData(folderName, "selfOut"))
    lenDic = IORobot.obtainLength(folderName, contigFile)

    embedded = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 != name2:
            firstFull = abs(lenDic[name1] - match1)
            secondFull = abs(lenDic[name2] - match2)

            if firstFull < thres and secondFull > thres:
                embedded.add(name1)
            elif firstFull > thres and secondFull < thres:
                embedded.add(name2)
            elif firstFull < thres and secondFull < thres:
                print("%s %s" % ("Both shortembedd", eachitem))

    nameList = list(lenDic)
    print(len(nameList))

    # Set-based filtering keeps the original order but avoids the O(n^2)
    # cost of repeated "in" tests and list.remove calls.
    nameList = [name for name in nameList if name not in embedded]
    print(len(nameList))

    IORobot.putListToFileO(folderName, contigFile, "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink):
    """Keep only the contigs of ``contigs.fasta`` not contained in another contig.

    Steps:
      1. Strip ``|`` from ``contigs.fasta`` (sed to ``contigs2.fasta``, copy back).
      2. Self-align ``contigs.fasta`` with MUMmer (prefix ``self``) and
         transform the coordinates of the extracted records.
      3. For each record between two different contigs, schedule for removal
         the contig matched to within ``thres`` of its full length when its
         partner is not; report pairs where both are.
      4. Hand the remaining names to ``IORobot.putListToFileO`` with the
         output name ``noEmbed``.

    ``folderName`` is used as a raw path prefix.
    """
    # print(...) with one argument prints the same text as "print x".
    print("removeEmbedded")
    thres = 10

    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta > " + folderName + "contigs2.fasta")
    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    print("removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta")
    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [["self", "contigs.fasta", "contigs.fasta", ""]],
        houseKeeper.globalParallel,
    )

    print("removeEmbedded: Extracting MUMmer data from delta files to selfOut")
    dataList = alignerRobot.transformCoor(alignerRobot.extractMumData(folderName, "selfOut"))
    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    embedded = set()
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        if name1 != name2:
            firstFull = abs(lenDic[name1] - match1)
            secondFull = abs(lenDic[name2] - match2)

            if firstFull < thres and secondFull > thres:
                embedded.add(name1)
            elif firstFull > thres and secondFull < thres:
                embedded.add(name2)
            elif firstFull < thres and secondFull < thres:
                print("%s %s" % ("Both shortembedd", eachitem))

    nameList = list(lenDic)
    print(len(nameList))

    # Set-based filtering keeps the original order but avoids the O(n^2)
    # cost of repeated "in" tests and list.remove calls.
    nameList = [name for name in nameList if name not in embedded]
    print(len(nameList))

    print("removeEmbedded: Outputting non-contained contigs to noEmbed.fasta")
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def alignWithName(leftSeg, rightSeg, folderName, mummerLink, nameOfOut):
    """Find the best overlap between the end of ``leftSeg`` and the start of ``rightSeg``.

    The trailing (up to) 50000 characters of ``leftSeg`` and the leading (up
    to) 50000 characters of ``rightSeg`` are written to two FASTA files named
    ``<nameOfOut>leftSeg.fasta`` and ``<nameOfOut>rightSeg.fasta`` inside
    ``folderName`` and aligned with ``alignerRobot.useMummerAlign``.

    A record qualifies when its field 1 is within ``thres`` of the written
    left length and its field 2 is below ``thres``; among qualifying records
    the one with the largest field 5 wins.  (Fields 1/2 presumably are the
    left end and right start coordinates, fields 4/5 the match lengths --
    confirm against ``extractMumData``.)

    Returns:
        ``[field4, field5]`` of the winning record, or ``[0, 0]`` if none.
    """
    maxSegLen = 50000
    thres = 10

    leftPath = folderName + nameOfOut + "leftSeg.fasta"
    rightPath = folderName + nameOfOut + "rightSeg.fasta"

    # A negative-start / bounded slice handles short inputs as well.
    leftPart = leftSeg[-maxSegLen:]
    rightPart = rightSeg[:maxSegLen]
    lLen = len(leftPart)

    # "with" closes each file even when a write raises.
    with open(leftPath, "w") as leftFile:
        leftFile.write(">SegL\n")
        leftFile.write(leftPart)

    with open(rightPath, "w") as rightFile:
        rightFile.write(">SegR\n")
        rightFile.write(rightPart)

    alignerRobot.useMummerAlign(
        mummerLink,
        folderName,
        nameOfOut,
        nameOfOut + "leftSeg.fasta",
        nameOfOut + "rightSeg.fasta",
        specialForRaw=False,
        specialName="",
        refinedVersion=True,
    )
    dataList = alignerRobot.extractMumData(folderName, nameOfOut + "Out")

    overlap = [0, 0]
    for eachitem in dataList:
        if eachitem[1] > lLen - thres and eachitem[2] < thres:
            if eachitem[5] > overlap[1]:
                overlap = [eachitem[4], eachitem[5]]

    return overlap
def align(leftSeg, rightSeg, folderName, mummerLink):
    """Return the best suffix/prefix overlap between two sequences via MUMmer.

    Up to the last 50000 characters of ``leftSeg`` and the first 50000
    characters of ``rightSeg`` are written to ``leftSeg.fasta`` and
    ``rightSeg.fasta`` in ``folderName`` and aligned with
    ``alignerRobot.useMummerAlign`` under the output name ``overlap``.

    A record qualifies when its field 1 is within ``thres`` of the written
    left length and its field 2 is below ``thres``; the qualifying record
    with the largest field 5 is chosen.  (Fields 1/2 presumably are the left
    end and right start coordinates, fields 4/5 the match lengths -- confirm
    against ``extractMumData``.)

    Returns:
        ``[field4, field5]`` of the chosen record, or ``[0, 0]`` if none.
    """
    maxSegLen = 50000
    thres = 10

    # Slicing already copes with sequences shorter than maxSegLen.
    leftTail = leftSeg[-maxSegLen:]
    rightHead = rightSeg[:maxSegLen]
    lLen = len(leftTail)

    # Context managers guarantee the handles are closed even if a write fails.
    with open(folderName + "leftSeg.fasta", "w") as f:
        f.write(">SegL\n")
        f.write(leftTail)

    with open(folderName + "rightSeg.fasta", "w") as f:
        f.write(">SegR\n")
        f.write(rightHead)

    alignerRobot.useMummerAlign(mummerLink, folderName, "overlap", "leftSeg.fasta", "rightSeg.fasta", False)
    dataList = alignerRobot.extractMumData(folderName, "overlapOut")

    overlap = [0, 0]
    for eachitem in dataList:
        if eachitem[1] > lLen - thres and eachitem[2] < thres and eachitem[5] > overlap[1]:
            overlap = [eachitem[4], eachitem[5]]

    return overlap
def observeOverlap(folderName):
    """Break contigs at long non-terminal self-overlaps and write ``breakChains.fasta``.

    Reads the previously produced ``selfOut`` MUMmer data and the lengths of
    ``contigs.fasta``.  A record between two different contigs is selected
    when its first match length exceeds ``matchThres`` and the match leaves
    more than ``nonMatchThres`` bases unmatched either after the end on both
    contigs or before the start on both contigs.

    The selected records are turned into break points by
    ``getBreakPointFromDataList``.  Contigs without break points are kept
    as-is and the others are replaced by the pieces returned from
    ``tmpBreakAcBkPts``.  The result is written with ``IORobot.writeSegOut``.
    """
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    matchThres = 10000
    nonMatchThres = 500

    newDataList = []
    for eachitem in dataList:
        name1, name2 = eachitem[-2], eachitem[-1]
        matchLen1 = eachitem[4]
        start1, end1, start2, end2 = eachitem[0], eachitem[1], eachitem[2], eachitem[3]

        if (
            name1 != name2
            and (
                min(lenDic[name1] - end1, lenDic[name2] - end2) > nonMatchThres
                or min(start1, start2) > nonMatchThres
            )
            and matchLen1 > matchThres
        ):
            # Same text as: print "eachitem ", eachitem, lenDic[name1], lenDic[name2]
            print("%s %s %s %s" % ("eachitem ", eachitem, lenDic[name1], lenDic[name2]))
            newDataList.append(eachitem)

    print("Count: " + str(len(newDataList)))

    blkDic = getBreakPointFromDataList(folderName, newDataList)
    LCList = IORobot.loadContigsFromFile(folderName, "contigs.fasta")

    # append/extend grow the list in place instead of rebuilding it with
    # "contigList = contigList + ..." on every iteration.
    contigList = []
    for eachcontig in LCList:
        if eachcontig not in blkDic:
            contigList.append(LCList[eachcontig])
        else:
            contigList.extend(tmpBreakAcBkPts(LCList[eachcontig], blkDic[eachcontig]))

    print("%s %s" % ("len(contigList)", len(contigList)))
    IORobot.writeSegOut(contigList, folderName, "breakChains.fasta")
def fillInMissed(folderName, mummerLink, filerefname, filequeryname, fileoutname):
    """Add reference contigs that the query does not fully cover to the output.

    ``fileoutname`` is first moved to ``filequeryname`` inside ``folderName``.
    The query is then aligned to ``filerefname`` with MUMmer (prefix
    ``<fileoutname>fillmiss``).  Records are grouped by their second-to-last
    field (the reference name, per the sample record
    ``1 765 | 11596 10822 | 765 775 | 84.25 | ref_NC_001133_ scf7180000000702``).
    A reference contig counts as missing when none of its records has a
    field-4 match length within ``thres`` of the contig's length.

    The missing reference contigs followed by every query contig are written
    to ``fileoutname`` with ``writeSegOut``.

    NOTE(review): a reference contig with no alignment record at all never
    appears in a group and is therefore not added -- confirm this is intended.
    """
    os.system("mv " + folderName + fileoutname + " " + folderName + filequeryname)

    alignJobs = [[fileoutname + "fillmiss", filerefname, filequeryname, ""]]
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, alignJobs, houseKeeper.globalParallel)
    dataList = alignerRobot.extractMumData(folderName, fileoutname + "fillmissOut")
    lenDic = obtainLength(folderName, filerefname)

    # groupby needs its input sorted by the grouping key.
    byRefName = itemgetter(-2)
    dataList.sort(key=byRefName)

    thres = 100
    extraList = [
        refName
        for refName, hits in groupby(dataList, byRefName)
        if not any(abs(int(hit[4]) - lenDic[refName]) < thres for hit in hits)
    ]

    referenceDic = loadContigsFromFile(folderName, filerefname)
    queryDic = loadContigsFromFile(folderName, filequeryname)

    ctgList = [referenceDic[name] for name in extraList]
    ctgList = ctgList + [queryDic[name] for name in queryDic]
    writeSegOut(ctgList, folderName, fileoutname)

    # Same output as: print "fileoutname: len(extraList)", fileoutname, len(extraList), len(ctgList)
    print("%s %s %s %s" % ("fileoutname: len(extraList)", fileoutname, len(extraList), len(ctgList)))
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    """Collect end-to-start overlaps between contigs as candidate links.

    1. ``writeToFile_Double1`` produces ``<inputFile>_Double.fasta`` from
       ``<inputFile>.fasta``.
    2. Each sequence line of that file is reduced to its first ``maxSize``
       characters followed by its last ``maxSize`` characters and written to
       ``<inputFile>_contigs_Double.fasta``.  Reading stops at the first line
       that is empty after stripping.
       NOTE(review): for lines shorter than ``2 * maxSize`` the two parts
       overlap, so bases are written twice -- confirm this is intended.
    3. The reduced file is self-aligned with MUMmer (prefix ``mummerFile``).
    4. A record ``[helperStart, helperEnd, readStart, readEnd, matchLen1,
       matchLen2, percentMatch, helperName, readName]`` is kept when the two
       names differ (also in their part before the first ``_``), the longer
       match exceeds ``minLen``, the read match is forward, the match starts
       within ``thres`` of a sequence start, ends within ``thres`` of a
       sequence end, and ``helperStart`` itself is below ``thres``.

    Returns:
        ``(numberOfContig, dataSet)`` where ``numberOfContig`` is the number
        of sequences in the reduced file and ``dataSet`` is a list of
        ``(matchLength, readName, helperName)`` tuples sorted descending.
    """
    thres = 5
    minLen = 400
    maxSize = 50000

    writeToFile_Double1(folderName, inputFile + ".fasta", inputFile + "_Double.fasta", "contig")

    # "with" closes both files even if reading or writing raises.
    with open(folderName + inputFile + "_Double.fasta", "r") as fmyFile:
        with open(folderName + inputFile + "_contigs_Double.fasta", "w") as fSmaller:
            myName = ""
            tmp = fmyFile.readline().rstrip()
            while len(tmp) > 0:
                if tmp[0] == ">":
                    fSmaller.write(tmp + "\n")
                    myName = tmp[1:]
                else:
                    head = tmp[:maxSize]
                    tail = tmp[max(0, len(tmp) - maxSize):]
                    fSmaller.write(head)
                    fSmaller.write(tail)
                    # Same output as: print "DebugName", myName, countComp
                    print("%s %s %s" % ("DebugName", myName, len(head) + len(tail)))
                    fSmaller.write("\n")
                tmp = fmyFile.readline().rstrip()

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta", ""]],
        houseKeeper.globalParallel,
    )

    lengthDic = obtainLength(folderName, inputFile + "_contigs_Double.fasta")
    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")

    dataSet = []
    for eachitem in dataSetRaw:
        (helperStart, helperEnd, readStart, readEnd, matchLen1, matchLen2,
         percentMatch, helperName, readName) = eachitem

        detailHelper = helperName.split("_")
        detailRead = readName.split("_")

        if (
            detailHelper[0] != detailRead[0]
            and helperName != readName
            and max(matchLen1, matchLen2) > minLen
            and readStart < readEnd
            and min(helperStart, readStart) < thres
            and min(lengthDic[helperName] - helperEnd, lengthDic[readName] - readEnd) + 1 < thres
            and helperStart < thres
        ):
            dataSet.append((max(matchLen1, matchLen2), readName, helperName))

    dataSet.sort(reverse=True)

    numberOfContig = len(lengthDic)
    return numberOfContig, dataSet
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    """Collect end-to-start overlaps between contigs as candidate links.

    The end tolerance ``thres`` is 10 when ``houseKeeper.globalRelaxThres``
    is truthy and 5 otherwise.  Previously a value that compared equal to
    neither ``False`` nor ``True`` left ``thres`` unbound and failed later
    with a NameError.

    1. ``writeToFile_Double1`` produces ``<inputFile>_Double.fasta`` from
       ``<inputFile>.fasta``.
    2. Each sequence line of that file is reduced to its first ``maxSize``
       characters followed by its last ``maxSize`` characters and written to
       ``<inputFile>_contigs_Double.fasta``.  Reading stops at the first line
       that is empty after stripping.
    3. The reduced file is self-aligned with MUMmer (prefix ``mummerFile``).
    4. A record ``[helperStart, helperEnd, readStart, readEnd, matchLen1,
       matchLen2, percentMatch, helperName, readName]`` is kept when the two
       names differ (also in their part before the first ``_``), the longer
       match exceeds ``minLen``, the read match is forward, the match starts
       within ``thres`` of a sequence start, ends within ``thres`` of a
       sequence end, and ``helperStart`` itself is below ``thres``.

    Returns:
        ``(numberOfContig, dataSet)`` where ``numberOfContig`` is the number
        of sequences in the reduced file and ``dataSet`` is a list of
        ``(matchLength, readName, helperName)`` tuples sorted descending.
    """
    minLen = 400
    maxSize = 50000
    # Always bind thres, whatever value the global flag holds.
    thres = 10 if houseKeeper.globalRelaxThres else 5

    doubleName = inputFile + "_Double.fasta"
    reducedName = inputFile + "_contigs_Double.fasta"

    writeToFile_Double1(folderName, inputFile + ".fasta", doubleName, "contig")

    # "with" closes both files even if reading or writing raises.
    with open(folderName + doubleName, "r") as fmyFile:
        with open(folderName + reducedName, "w") as fSmaller:
            myName = ""
            tmp = fmyFile.readline().rstrip()
            while len(tmp) > 0:
                if tmp[0] == ">":
                    fSmaller.write(tmp + "\n")
                    myName = tmp[1:]
                else:
                    head = tmp[:maxSize]
                    tail = tmp[max(0, len(tmp) - maxSize):]
                    fSmaller.write(head)
                    fSmaller.write(tail)
                    # Same output as: print "DebugName", myName, countComp
                    print("%s %s %s" % ("DebugName", myName, len(head) + len(tail)))
                    fSmaller.write("\n")
                tmp = fmyFile.readline().rstrip()

    alignerRobot.useMummerAlignBatch(
        mummerLink,
        folderName,
        [[mummerFile, reducedName, reducedName, ""]],
        houseKeeper.globalParallel,
    )

    lengthDic = obtainLength(folderName, reducedName)
    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")

    dataSet = []
    for eachitem in dataSetRaw:
        (helperStart, helperEnd, readStart, readEnd, matchLen1, matchLen2,
         percentMatch, helperName, readName) = eachitem

        detailHelper = helperName.split("_")
        detailRead = readName.split("_")

        if (
            detailHelper[0] != detailRead[0]
            and helperName != readName
            and max(matchLen1, matchLen2) > minLen
            and readStart < readEnd
            and min(helperStart, readStart) < thres
            and min(lengthDic[helperName] - helperEnd, lengthDic[readName] - readEnd) + 1 < thres
            and helperStart < thres
        ):
            dataSet.append((max(matchLen1, matchLen2), readName, helperName))

    dataSet.sort(reverse=True)

    numberOfContig = len(lengthDic)
    return numberOfContig, dataSet