def agouti_path_main(agoutiPaths, dSenses, vertex2Name, dGFFs, dCtgPair2GenePair,
                     oriScafPathFile, outDir, prefix):
    """Reconcile AGOUTI scaffolding paths with the original (shredded) scaffolds.

    Reads the shred-info file produced at assembly-shredding time, reports
    consistency between the original paths and the AGOUTI paths, and recovers
    original joins that AGOUTI left untouched.

    Parameters
    ----------
    agoutiPaths : list of scaffolding paths (lists of vertex indices)
    dSenses : orientation-evidence map, passed through to the recovery step
    vertex2Name : list mapping vertex index -> contig name
    dGFFs : dict mapping contig name -> gene models
    dCtgPair2GenePair : dict mapping (vertexA, vertexB) -> supporting gene pair
    oriScafPathFile : path to the shred-info file written by the shredder
    outDir, prefix : output directory and file-name prefix

    Returns
    -------
    (agoutiPaths, dCtgPair2GenePair, dSenses) — the three inputs, updated by
    recover_untouched_sequences().
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_path")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    agPathProgress = agLOG.PROGRESS_METER(moduleName)
    agPathProgress.logger.info("Analyzing scaffolding paths")
    outDebugFile = os.path.join(moduleOutDir, prefix) + ".agouti_path.debug"
    # NOTE(review): debug logger is named "SHREDDER" while every other module
    # here uses moduleName — looks like a copy-paste leftover; confirm before
    # changing, since downstream log filtering may key on the name.
    agPathDebug = agLOG.DEBUG("SHREDDER", outDebugFile)
    agPathProgress.logger.info("[BEGIN] Reading file with shred info")
    dOriPaths, dOriGaps = read_original_path(oriScafPathFile, agPathProgress)
    agPathProgress.logger.info("[DONE]")
    # shut it off for now; working to improve it
    #agPathProgress.logger.info("[BEGIN] Checking consistency")
    #compare(dOriPaths, agoutiPaths, vertex2Name, outDir, prefix)
    #agPathProgress.logger.info("[DONE]")
    report_consistency(agoutiPaths, dOriPaths, vertex2Name, outDir, prefix)
    # fixed typo in the progress message ("Recovring" -> "Recovering")
    agPathProgress.logger.info("[BEGIN] Recovering original scaffolding")
    agoutiPaths, dCtgPair2GenePair, dSenses = recover_untouched_sequences(
        dOriPaths, agoutiPaths, vertex2Name, dGFFs, dCtgPair2GenePair,
        dSenses, agPathProgress, agPathDebug)
    agPathProgress.logger.info("[DONE]")
    return agoutiPaths, dCtgPair2GenePair, dSenses
def agouti_update(agoutiPaths, dSeqs, vertex2Name, dSenses, dGFFs,
                  dCtgPair2GenePair, outDir, prefix,
                  nFills=1000, debug=0, no_update_gff=0):
    """Emit the final scaffolded assembly (FASTA) and, optionally, updated gene models (GFF).

    Walks every AGOUTI path, stitches the member contig sequences together with
    nFills 'N' characters per junction (orienting each contig according to the
    read-pair orientation evidence in dSenses), writes the resulting scaffolds
    plus all un-scaffolded contigs to <outDir>/<prefix>.agouti.fasta, and —
    unless no_update_gff is set — lifts/merges the per-contig gene models onto
    scaffold coordinates and writes the final GFF and summary files.

    Parameters
    ----------
    agoutiPaths : list of paths; each path is a list of vertex indices into dSeqs/vertex2Name
    dSeqs : dict vertex index -> contig sequence string
    vertex2Name : list mapping vertex index -> contig name
    dSenses : orientation-evidence map consumed by get_orientation_counts()
    dGFFs : dict contig name -> list of gene models (mutated in place when models are reversed)
    dCtgPair2GenePair : dict (vertexA, vertexB) -> [geneA, geneB] supporting the join
    outDir, prefix : output directory and file-name prefix
    nFills : number of 'N's inserted at each junction (gap fill size)
    debug : when truthy, verbose per-junction logging to the module debug log
    no_update_gff : when truthy, skip all gene-model updating (sequence output only)

    Returns nothing; all results are written to files under outDir.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_update")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_update.progressMeter" % (prefix))
    # module-level loggers are globals so helpers in this module can use them
    global agUPDATEProgress
    agUPDATEProgress = agLOG.PROGRESS_METER(moduleName)
    agUPDATEProgress.add_file_handler(progressLogFile)
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_update.debug" % (prefix))
        global agUPDATEDebug
        agUPDATEDebug = agLOG.DEBUG(moduleName, debugLogFile)
    if not no_update_gff:
        agUPDATEProgress.logger.info("[BEGIN] Updating gene models")
    outFasta = os.path.join(outDir, "%s.agouti.fasta" % (prefix))
    fFASTA = open(outFasta, 'w')
    dUpdateGFFs = collections.defaultdict(list)       # scaffold name -> lifted gene models
    dMergedGene2Ctgs = collections.defaultdict(list)  # merged gene ID -> member contigs
    dMergedGene2Genes = collections.defaultdict(list) # merged gene ID -> member gene IDs
    scafPaths = []
    numMergedGene = 0
    nCtgScaffolded = 0
    scaffoldedCtgs = {}   # contig names (possibly "-" prefixed) that ended up in a scaffold
    seqLens = []
    dScafGaps = {}        # scaffold name -> list of (gapStart, gapStop), 1-based
    dScafStats = {}       # scaffold/contig name -> sequence length
    scafID = 0
    mergedGenes = []
    for i in range(len(agoutiPaths)):
        path = agoutiPaths[i]
        scafID += 1
        scafName = prefix + "_scaf_%d" % (scafID)
        dScafStats[scafName] = 0
        dScafGaps[scafName] = []
        # start the scaffold with the first contig, assumed forward ("+")
        curVertex = path[0]
        sequence = dSeqs[curVertex]
        curSense = "+"
        curCtg = vertex2Name[curVertex]
        preCtg = ""
        scafPath = [curVertex]   # vertices; negated when a contig is reverse-complemented
        preGeneID, curGeneID = "", ""
        mergedGene = agGFF.AGOUTI_GFF()
        preMergedGene = agGFF.AGOUTI_GFF()
        gapStart, gapStop = 0, 0
        offset = 0               # cumulative coordinate offset of curCtg within the scaffold
        orientation = ""
        updatedGeneIDs = []
        mergedGenesPerPath = []
        excludeGeneIDs = []
        for nextVertex in path[1:]:
            nextCtg = vertex2Name[nextVertex]
            if preCtg == "":
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t>scaf_%d - path - %s"
                        % (scafID, str([vertex2Name[vertex] for vertex in path])))
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tcurVertex - %d - %s - nextVertex - %d - %s"
                    % (curVertex, curCtg, nextVertex, nextCtg))
            if not no_update_gff:
                #curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair, curCtg, nextCtg)
                curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair,
                                                     curVertex, nextVertex)
                #!!! I should not break here, should continue#
                if curGene is None and nextGene is None:
                    # a join without a supporting gene pair should be impossible here
                    agUPDATEProgress.logger.error(
                        "%s - %s found no gene models joining them"
                        % (curCtg, nextCtg))
                    agUPDATEProgress.logger.error("This is NOT EXPECTED, REPORT!")
                    sys.exit(1)
                curGeneID = curGene.geneID
                excludeGeneIDs = [preGeneID] + [curGeneID]
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t\tpreGene - %s - curGene - %s - nextGene - %s"
                        % (preGeneID, curGene.geneID, nextGene.geneID))
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafName - %s" % (scafName))
            FR, FF, RR, RF = get_orientation_counts(curVertex, nextVertex, dSenses)
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" % (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" % (FR, FF, RF, RR))
            if curSense == "-":
                # current contig is already reversed in the growing scaffold:
                # remap the orientation counts accordingly (FR<->RR, FF<->RF)
                temp1 = FR
                temp2 = FF
                FR = RR
                FF = RF
                RR = temp1
                RF = temp2
            orientation = decide_orientation(FR, FF, RR, RF)
            # gap coordinates are 0-based here; stored 1-based in dScafGaps
            gapStart = gapStop + len(dSeqs[curVertex])
            gapStop = gapStart + nFills - 1
            dScafGaps[scafName].append((gapStart + 1, gapStop + 1))
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" % (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" % (FR, FF, RF, RR))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\toffset - %d - curCtgLen - %d"
                    % (offset, len(dSeqs[curVertex])))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tgapstart - %d - gapstop - %d"
                    % (gapStart, gapStop + 1))
            valid = 0   # NOTE(review): never read afterwards — appears vestigial
            # In each orientation branch below: the first sub-branch handles a
            # junction starting a new merged gene (curGene differs from the gene
            # merged at the previous junction); the else extends the previous
            # merged gene (preMergedGene) across this junction.
            if orientation == "FR":
                # both contigs forward
                if not no_update_gff:
                    if curGeneID != preGeneID:
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset, gapStop,
                                                      debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        #if curGene.geneStop != 0:
                        #	dMergedGene2Genes[mergedGene.geneID] += [curGeneID]
                        #if nextGene.geneStop != 0:
                        #	dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        if nextGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        # replace the previously-placed merged gene in place
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + dSeqs[nextVertex]
                scafPath += [nextVertex]
                curSense = "+"
            elif orientation == "FF":
                # next contig must be reverse-complemented
                if not no_update_gff:
                    #nextGene = reverse_gene_model(nextGene, len(dSeqs[nextVertex]), debug)
                    dGFFs[nextCtg] = reverse_gene_models(dGFFs[nextCtg],
                                                         len(dSeqs[nextVertex]),
                                                         debug)
                    if curGeneID != preGeneID:
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset, gapStop,
                                                      debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + agSeq.rc_seq(dSeqs[nextVertex])
                scafPath += [-1 * nextVertex]   # negative vertex = reversed contig
                curSense = "-"
            elif orientation == "RR":
                # current scaffold end must be reverse-complemented
                if not no_update_gff:
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(dGFFs[curCtg],
                                                            len(dSeqs[curVertex]),
                                                            debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset, gapStop,
                                                      debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        # place current contig's models, then flip the whole
                        # scaffold's models since the scaffold gets RC'ed below
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug)
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName], gapStart - 1, debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                    'N'*nFills + dSeqs[nextVertex]
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [nextVertex]
                curSense = "+"
            elif orientation == "RF":
                # both sides flipped: RC the scaffold so far AND the next contig
                if not no_update_gff:
                    dGFFs[nextCtg] = reverse_gene_models(dGFFs[nextCtg],
                                                         len(dSeqs[nextVertex]),
                                                         debug)
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(dGFFs[curCtg],
                                                            len(dSeqs[curVertex]),
                                                            debug)
                        #nextGene = reverse_gene_model(nextGene, len(dSeqs[nextVertex]), debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene, scafName,
                                                      numMergedGene, offset, gapStop,
                                                      debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [curCtg, nextCtg]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [curGeneID,
                                                                     nextGene.geneID]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug)
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName],
                            gapStop + len(dSeqs[curVertex]), debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                    'N'*nFills + \
                    agSeq.rc_seq(dSeqs[nextVertex])
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [-1 * nextVertex]
                curSense = "-"
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tscafPath in vertices updates- %s"
                    % (str(scafPath)))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tdMergedGene2Gene - %s"
                    % (str(dMergedGene2Genes[mergedGene.geneID])))
            if not no_update_gff:
                mergedGenesPerPath.append(mergedGene.geneID)
                preGeneID = nextGene.geneID
            # advance the junction window
            offset = gapStop
            preCtg = curCtg
            curVertex = nextVertex
            curCtg = vertex2Name[curVertex]
        # convert the vertex path to human-readable contig names,
        # "-" prefix marking reverse-complemented members
        for i in range(len(scafPath)):
            v = scafPath[i]
            if v < 0:
                scafPath[i] = "-" + vertex2Name[-1 * v]
            else:
                scafPath[i] = vertex2Name[v]
        scafPaths += [scafPath]
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tscafPath in human-readable updates- %s"
                % (str(scafPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tappend last curCtg - %s" % (curCtg))
            agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafPath - %s"
                                         % (str(scafPath)))
        if not no_update_gff:
            # place the last contig's remaining gene models
            excludeGeneIDs = [preGeneID]
            mergedGenes.append(mergedGenesPerPath)
            dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                offset, excludeGeneIDs, debug)
        fFASTA.write(">%s |%dbp |%s\n%s\n"
                     % (scafName, len(sequence), ",".join(scafPath), sequence))
        dScafStats[scafName] = len(sequence)
        seqLens.append(len(sequence))
        #agPaths.append(scafPath)
        nCtgScaffolded += len(scafPath)
        scaffoldedCtgs.update(dict((contig, 1) for contig in scafPath))
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tmergedGenesPerPath - %s"
                % (str(mergedGenesPerPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t-------------------------------------")
    agPATH.report_scaffold_path(scafPaths, vertex2Name, outDir, prefix)
    # other contigs need to be output
    agUPDATEProgress.logger.info("Finalizing sequences")
    for vertex in dSeqs:
        # skip contigs consumed by a scaffold (either orientation)
        if vertex2Name[vertex] in scaffoldedCtgs or \
           "-" + vertex2Name[vertex] in scaffoldedCtgs:
            continue
        fFASTA.write(">%s\n%s\n" % (vertex2Name[vertex], dSeqs[vertex]))
        dScafStats[vertex2Name[vertex]] = len(dSeqs[vertex])
        seqLens.append(len(dSeqs[vertex]))
    fFASTA.close()
    n50 = agSeq.get_assembly_NXX(seqLens)
    agUPDATEProgress.logger.info("Outputting updated Gene Moddels")
    # drop per-contig models of contigs now represented by scaffold models
    for vertex in dSeqs:
        if vertex2Name[vertex] in scaffoldedCtgs:
            if vertex2Name[vertex] in dGFFs:
                del dGFFs[vertex2Name[vertex]]
    if not no_update_gff:
        dFinalGFFs = dict(dGFFs, **dUpdateGFFs)
        numGenes = output_gff(dFinalGFFs, dMergedGene2Ctgs, dMergedGene2Genes,
                              dScafStats, dScafGaps, outDir, prefix)
        agUPDATEProgress.logger.info("Summarizing AGOUTI gene paths")
        summarize_gene_path(dMergedGene2Genes, dMergedGene2Ctgs, outDir, prefix)
    agUPDATEProgress.logger.info("-----------Summary-----------")
    agUPDATEProgress.logger.info("number of contigs scaffoled: %d"
                                 % (nCtgScaffolded))
    agUPDATEProgress.logger.info("number of scaffolds: %d" % (scafID))
    agUPDATEProgress.logger.info(
        "number of contigs in the final assembly: %d" % (len(seqLens)))
    agUPDATEProgress.logger.info("Final assembly N50: %d" % (n50))
    if not no_update_gff:
        agUPDATEProgress.logger.info("Final number of genes: %d" % (numGenes))
    agUPDATEProgress.logger.info("Succeeded")
def denoise_joining_pairs(dContigPairs, dGFFs, vertex2Name, outDir, prefix,
                          minSupport, debug=0):
    """Filter contig-pair joining evidence down to noise-free pairs.

    A contig pair survives when (a) it has at least minSupport read pairs,
    (b) a supporting gene pair can be located on the two contigs, and (c) the
    gene indices / gene ends / dominant read-pair orientation form one of the
    four consistent join configurations (FR 3'-5', RR 5'-5', FF 3'-3',
    RF 5'-3', plus the single-gene-per-contig special case).  Surviving read
    pairs are written to <prefix>.agouti.join_pairs.noise_free.txt.

    NOTE: Python 2 code (xrange, time.clock, and `del dContigPairs[ctgPair]`
    while iterating .items() — safe only because Py2 .items() returns a list).
    dGFFs is mutated in place when fake gene models are created.

    Parameters
    ----------
    dContigPairs : dict (ctgA, ctgB) -> list of
        (startA, startB, stopA, stopB, senseA, senseB, readID);
        entries with < minSupport pairs are DELETED from this dict
    dGFFs : dict contig name -> list of gene models (mutated)
    vertex2Name : list of contig names; .index() maps name -> vertex id
    outDir, prefix : output directory and file-name prefix
    minSupport : minimum number of read pairs to keep a contig pair
    debug : verbose per-pair logging when truthy

    Returns
    -------
    (dCtgPair2GenePair, dCtgPairDenoise) keyed by (vertexA, vertexB):
    the supporting [geneA, geneB] and [weight, (senseA, senseB)] respectively.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_denoise")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_denoise.progressMeter" % (prefix))
    agDENOISEProgress = agLOG.PROGRESS_METER(moduleName)
    agDENOISEProgress.add_file_handler(progressLogFile)
    debugLogFile = ""
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_denoise.debug" % (prefix))
        global agDENOISEDebug
        agDENOISEDebug = agLOG.DEBUG(moduleName, debugLogFile)
    agDENOISEProgress.logger.info("[BEGIN] Denoising joining pairs")
    startTime = time.clock()
    dCtgPair2GenePair = collections.defaultdict()
    dCtgPairDenoise = collections.defaultdict()
    dMappedPos = collections.defaultdict()      # NOTE(review): never used below
    daddedModels = collections.defaultdict(list)  # NOTE(review): never used below
    nFail4Combination = 0
    nFailGeneModel = 0
    nFailK = 0
    outDenoiseJPFile = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.noise_free.txt" % (prefix))
    fOUT = open(outDenoiseJPFile, 'w')
    for ctgPair, pairInfo in dContigPairs.items():
        # support filter: too few read pairs -> drop the contig pair entirely
        if len(pairInfo) < minSupport:
            nFailK += 1
            del dContigPairs[ctgPair]
            continue
        ctgA = ctgPair[0]
        ctgB = ctgPair[1]
        if debug:
            agDENOISEDebug.debugger.debug(
                "DENOISE_MAIN\t>contigA - %s - contigB - %s" % (ctgA, ctgB))
        pairToRemove = []   # NOTE(review): populated nowhere in this function
        mapIntervalsA = []
        mapIntervalsB = []
        pairs = []
        senses = []
        keep = 0
        # collect mapping intervals and sense pairs for all supporting reads
        for i in xrange(len(pairInfo)):
            startA, startB, stopA, stopB, senseA, senseB, readID = pairInfo[i]
            mapIntervalsA += [(startA, stopA)]
            mapIntervalsB += [(startB, stopB)]
            pairs += [(startA, stopA, startB, stopB)]
            senses += [(senseA, senseB)]
        genePair = get_genePair_for_contigPair(dGFFs, ctgA, ctgB,
                                               mapIntervalsA, mapIntervalsB,
                                               senses, debug)
        geneModelsA = dGFFs[ctgA]
        geneModelsB = dGFFs[ctgB]
        if genePair is None:
            nFailGeneModel += 1
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tFail to find a pair of gene models")
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t----------------------------------")
        else:
            geneIndexA, geneIndexB, endA, endB, intervalsA, intervalsB, senses = genePair
            sensesCounter = collections.Counter(senses)
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tsensesCounter: %s" % (str(sensesCounter)))
            # Normalize (geneIndexB, endB): sentinel indices -1/1 mean "no real
            # model at that end of ctgB" -> fabricate one from read intervals;
            # endB == 0 means "end undecided", endB 5/3 = 5'/3' end.
            if geneIndexB != 0:
                # create gene model according to endB using intervalsB
                if geneIndexB == -1 and (endB == 5 or endB == 0):
                    dGFFs[ctgB] = create_fake_genes(geneModelsB, 0, ctgB,
                                                    intervalsB, debug)
                    geneIndexB = 0
                    endB = 5
                elif geneIndexB == 1 and (endB == 3 or endB == 0):
                    dGFFs[ctgB] = create_fake_genes(geneModelsB,
                                                    len(geneModelsB), ctgB,
                                                    intervalsB, debug)
                    geneIndexB = len(dGFFs[ctgB]) - 1
                    endB = 3
            else:
                if endB == 0:
                    endB = 5
                elif endB == 3:
                    geneIndexB = len(dGFFs[ctgB]) - 1
            # same normalization for ctgA (default undecided end is 3')
            if geneIndexA != 0:
                # create gene model according to endA using intervalsA
                if geneIndexA == -1 and (endA == 5 or endA == 0):
                    dGFFs[ctgA] = create_fake_genes(geneModelsA, 0, ctgA,
                                                    intervalsA, debug)
                    geneIndexA = 0
                    endA = 5
                elif geneIndexA == 1 and (endA == 3 or endA == 0):
                    dGFFs[ctgA] = create_fake_genes(geneModelsA,
                                                    len(geneModelsA), ctgA,
                                                    intervalsA, debug)
                    geneIndexA = len(dGFFs[ctgA]) - 1
                    endA = 3
            else:
                if endA == 0:
                    endA = 3
                elif endA == 3:
                    geneIndexA = len(dGFFs[ctgA]) - 1
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tgenePair: %s"
                                              % (str(genePair)))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t# models on ctgA - %d - # models on ctgB - %d"
                    % (len(dGFFs[ctgA]), len(dGFFs[ctgB])))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneIndexA - %d - endA - %d - geneIndexB - %d - endB - %d"
                    % (geneIndexA, endA, geneIndexB, endB))
            # dominant orientation pair across the supporting reads
            sense = sorted(sensesCounter.items(), key=operator.itemgetter(1),
                           reverse=True)[0][0]
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tsensePair - %s"
                                              % (str(sense)))
            if (geneIndexA == len(dGFFs[ctgA])-1 and endA == 3) and \
               (geneIndexB == 0 and endB == 5) and sense == ('+', '-'):
                # FR + 3'-5'
                keep = 1
            elif (geneIndexA == 0 and endA == 5) and \
                 (geneIndexB == 0 and endB == 5) and sense == ('-', '-'):
                # RR + 5'-5'
                keep = 1
            elif (geneIndexA == len(dGFFs[ctgA])-1 and endA == 3) and \
                 (geneIndexB == len(dGFFs[ctgB])-1 and endB == 3) and \
                 sense == ('+', '+'):
                # FF + 3'-3'
                keep = 1
            elif (geneIndexA == 0 and endA == 5) and \
                 (geneIndexB == len(dGFFs[ctgB])-1 and endB == 3) and \
                 sense == ('-', '+'):
                # RF + 5'-3'
                keep = 1
            elif (geneIndexA == 0 and (endA == 0 or endA == 3)) and \
                 (geneIndexB == 0 and (endB == 0 or endB == 5)) and \
                 sense == ('+', '-'):
                # only one gene on the contig
                # it doesn't matter which end
                keep = 1
            if keep:
                geneA = dGFFs[ctgA][geneIndexA]
                geneB = dGFFs[ctgB][geneIndexB]
                dCtgPair2GenePair[vertex2Name.index(ctgA),
                                  vertex2Name.index(ctgB)] = [geneA, geneB]
                if debug:
                    agDENOISEDebug.debugger.debug("DENOISE_MAIN\tNOISE-FREE")
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\tgeneA ID - %s - startA - %d - stopA = %d"
                        % (geneA.geneID, geneA.geneStart, geneA.geneStop))
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\tgeneB ID - %s - startB - %d - stopB = %d"
                        % (geneB.geneID, geneB.geneStart, geneB.geneStop))
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\t----------------------------------")
                senseA = sense[0]
                senseB = sense[1]
                weight = 0
                # keep only reads that do NOT overlap the supporting genes
                # (empty intervalsA/B means "use all reads" on that side)
                for i in xrange(len(pairInfo)):
                    startA, startB, stopA, stopB, _, _, readID = pairInfo[i]
                    intervalA = (startA, stopA)
                    intervalB = (startB, stopB)
                    #print "intervalA", intervalA, "intervalB", intervalB
                    if len(intervalsA) == 0:
                        if len(intervalsB) == 0:
                            #print "use all"
                            fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                       % (readID, ctgA, startA, senseA,
                                          ctgB, startB, senseB))
                            weight += 1
                        else:
                            #print "use all A, not all B"
                            overlap = find_overlap(
                                intervalB, (geneB.geneStart, geneB.geneStop))
                            if overlap == 0:
                                fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                           % (readID, ctgA, startA, senseA,
                                              ctgB, startB, senseB))
                                weight += 1
                    else:
                        if len(intervalsB) == 0:
                            #print "use all B, not all A"
                            overlap = find_overlap(
                                intervalA, (geneA.geneStart, geneA.geneStop))
                            if overlap == 0:
                                fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                           % (readID, ctgA, startA, senseA,
                                              ctgB, startB, senseB))
                                weight += 1
                        else:
                            #print "not all Both"
                            overlapA = find_overlap(
                                intervalA, (geneA.geneStart, geneA.geneStop))
                            overlapB = find_overlap(
                                intervalB, (geneB.geneStart, geneB.geneStop))
                            if overlapA == 0 and overlapB == 0:
                                fOUT.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
                                           % (readID, ctgA, startA, senseA,
                                              ctgB, startB, senseB))
                                weight += 1
                dCtgPairDenoise[vertex2Name.index(ctgA),
                                vertex2Name.index(ctgB)] = [weight,
                                                            (senseA, senseB)]
            else:
                nFail4Combination += 1
                # if len(sensesCounter) == 1:
                # 	sense = sensesCounter.keys()[0]
                # else:
                # 	print "multiple sense pairs"
                # 	senses = sorted(sensesCounter.items(), key=operator.itemgetter(1), reverse=True)[0:2]
                # 	print "senses", senses
                # 	ratio = float(senses[0][1])/(senses[0][1]+senses[1][1])
                # 	print "ratio", ratio
    fOUT.close()
    agDENOISEProgress.logger.info("Succeeded")
    agDENOISEProgress.logger.info("Denoise took in %.2f min CPU time"
                                  % ((time.clock() - startTime) / 60))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for spanning across >1 gene models"
        % (nFailGeneModel))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for not being one of the four combinations"
        % (nFail4Combination))
    agDENOISEProgress.logger.info("%d contig pairs filtered for less support"
                                  % (nFailK))
    agDENOISEProgress.logger.info("%d contig pairs for scaffolding"
                                  % (len(dCtgPairDenoise)))
    return dCtgPair2GenePair, dCtgPairDenoise
def get_joining_pairs(bamStream, outDir, prefix, overwrite,
                      minMapQ=5, minFracOvl=0.0, maxFracMismatch=1.0, debug=0):
    """Extract joining read pairs from an already-open SAM text stream.

    Reads records two lines at a time (assumes mate records are adjacent and
    name-sorted — TODO confirm against the caller), keeps pairs whose mates
    map to DIFFERENT contigs and pass the quality filters, and records them
    under a canonical (lexicographically smaller contig first) key.  All kept
    pairs are also written to <prefix>.agouti.join_pairs.all.txt so a later
    run can resume via retrieve_joininng_pairs().

    Parameters
    ----------
    bamStream : file-like object yielding SAM-formatted text lines
    outDir, prefix : output directory and file-name prefix
    overwrite : when falsy and a previous progress log exists, try to reuse
        the previous run's join-pairs file instead of re-parsing
    minMapQ : minimum of the two mates' mapping qualities
    minFracOvl : minimum aligned-length/read-length fraction (both mates)
    maxFracMismatch : maximum mismatch/aligned-length fraction (both mates)
    debug : verbose per-pair logging when truthy

    Returns
    -------
    dict (ctgX, ctgY) -> list of (startX, startY, stopX, stopY,
    senseX, senseY, readsID) with coordinates relative to each contig.
    Exits the process if no joining pair is found.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    if not os.path.exists(progressLogFile):
        # fresh run
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
    else:
        if not overwrite:
            # try to resume from the previous run's output
            agBAMProgress.add_file_handler(progressLogFile, 'a')
            dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                                   agBAMOutAllJoinPairs)
            if dContigPairs is not None:
                return dContigPairs
            else:
                agBAMProgress.logger.info(
                    "Fail to pick up results from the previous run")
                agBAMProgress.logger.info("Re-processing the BAM file")
        else:
            agBAMProgress.add_file_handler(progressLogFile)
            agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")
    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)
    with open(agBAMOutAllJoinPairs, 'w') as fOUT:
        agBAMProgress.logger.info(
            "# processed\t| Current Reads ID\t| Elapsed Time")
        if debug:
            agBAMDebug.debugger.debug(
                "Reads_ID\tLocationA\tLocationB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
            )
        startTime = time.time()
        dContigPairs = collections.defaultdict(list)
        nJoinPairs = 0
        nReadsPairs = 0
        while True:
            # mates are assumed to be on consecutive lines
            pairA = bamStream.readline().strip().split("\t")
            pairB = bamStream.readline().strip().split("\t")
            # reach the end of the file
            if len(pairA) == 1 or len(pairB) == 1:
                break
            readsID = pairA[0]
            contigA = pairA[2]
            contigB = pairB[2]
            nReadsPairs += 1
            if pairA[0] == pairB[0] and contigA != contigB:
                # SAM columns: 1 flag, 3 pos, 4 mapq, 5 cigar, 9 seq, 11+ tags
                alnLenA = getCIGAR(pairA[5])
                alnLenB = getCIGAR(pairB[5])
                leftMostPosA = int(pairA[3])
                leftMostPosB = int(pairB[3])
                readLenA = len(pairA[9])
                readLenB = len(pairB[9])
                nMismatchesA = getMismatches(pairA[11:])
                nMismatchesB = getMismatches(pairB[11:])
                mapQA = int(pairA[4])
                mapQB = int(pairB[4])
                flagsA = explainSAMFlag(int(pairA[1]))
                flagsB = explainSAMFlag(int(pairB[1]))
                senseA = flagsA[4]
                senseB = flagsB[4]
                if debug:
                    agBAMDebug.debugger.debug(
                        "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d"
                        % (readsID, contigA + ":" + str(leftMostPosA),
                           contigB + ":" + str(leftMostPosB), mapQA, mapQB,
                           senseA, senseB, readLenA, readLenB))
                # NOTE(review): under Python 2, alnLen/readLen and
                # nMismatches/alnLen are integer divisions unless getCIGAR/
                # getMismatches return floats — verify those helpers.
                if (min(alnLenA / readLenA, alnLenB / readLenB) >= minFracOvl and  # minimum fraction of overlaps
                        max(nMismatchesA / alnLenA, nMismatchesB / alnLenB) <= maxFracMismatch and  # maximum fraction of mismatches
                        min(mapQA, mapQB) >= minMapQ):  # minimum mapping quality
                    startA = leftMostPosA + 1
                    stopA = startA + 1 + int(alnLenA)
                    startB = leftMostPosB + 1
                    stopB = startB + 1 + int(alnLenB)
                    nJoinPairs += 1
                    # store under a canonical key: smaller contig name first
                    if contigA <= contigB:
                        if (contigA, contigB) not in dContigPairs:
                            dContigPairs[contigA, contigB] = [
                                (startA, startB, stopA, stopB,
                                 senseA, senseB, readsID)
                            ]
                        else:
                            dContigPairs[contigA, contigB] += [
                                (startA, startB, stopA, stopB,
                                 senseA, senseB, readsID)
                            ]
                        fOUT.write("%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
                                   % (readsID, contigA, startA, stopA, senseA,
                                      contigB, startB, stopB, senseB))
                    else:
                        if (contigB, contigA) not in dContigPairs:
                            dContigPairs[contigB, contigA] = [
                                (startB, startA, stopB, stopA,
                                 senseB, senseA, readsID)
                            ]
                        else:
                            dContigPairs[contigB, contigA] += [
                                (startB, startA, stopB, stopA,
                                 senseB, senseA, readsID)
                            ]
                        fOUT.write("%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
                                   % (readsID, contigB, startB, stopB, senseB,
                                      contigA, startA, stopA, senseA))
            # periodic progress heartbeat
            if nReadsPairs % 5000000 == 0:
                elapsedTime = float((time.time() - startTime) / 60)
                agBAMProgress.logger.info("%d parsed\t| %s\t| %.2f m"
                                          % (nReadsPairs, readsID, elapsedTime))
    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs"
                              % (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    else:
        agBAMProgress.logger.info("Succeeded")
    return dContigPairs
def agouti_sam_main(bamFile, outDir, prefix, overwrite,
                    minMapQ, minFracOvl, maxFracMismatch, debug=0):
    """Extract joining read pairs directly from a BAM file via samtools.

    Same filtering pipeline as get_joining_pairs(), but drives samtools
    itself: run_samtools() yields one record per read pair (two SAM lines
    joined by "\\n").  Pairs mapping to two different contigs that pass the
    mapping-quality / overlap / mismatch filters are collected and also
    written to <prefix>.agouti.join_pairs.all.txt for resumable runs.

    Parameters
    ----------
    bamFile : path to the input BAM
    outDir, prefix : output directory and file-name prefix
    overwrite : when falsy and a previous progress log exists, try to reuse
        the previous run's join-pairs file instead of re-parsing
    minMapQ, minFracOvl, maxFracMismatch : per-pair quality thresholds
    debug : verbose per-pair logging when truthy

    Returns
    -------
    dict (ctgX, ctgY) -> list of (startX, startY, stopX, stopY,
    senseX, senseY, readsID).  Exits the process on zero joining pairs
    or on keyboard interrupt.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    if not os.path.exists(progressLogFile):
        # fresh run
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
    else:
        if not overwrite:
            # try to resume from the previous run's output
            agBAMProgress.add_file_handler(progressLogFile, 'a')
            dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                                   agBAMOutAllJoinPairs)
            if dContigPairs is not None:
                return dContigPairs
            else:
                agBAMProgress.logger.info(
                    "Fail to pick up results from the previous run")
                agBAMProgress.logger.info("Re-processing the BAM file")
        else:
            agBAMProgress.add_file_handler(progressLogFile)
            agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")
    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)
    # before running samtools, check its availability
    agBAMProgress.logger.info("check SAMtools")
    check_samtools(agBAMProgress)
    # runing samtools
    agBAMProgress.logger.info("run SAMtools")
    try:
        with open(agBAMOutAllJoinPairs, 'w') as fOUT:
            agBAMProgress.logger.info(
                "# processed\t| Current Reads ID\t| Elapsed Time")
            if debug:
                # NOTE(review): this header has fewer columns than the rows
                # written below (which add alnLenA/alnLenB) — confirm intended.
                agBAMDebug.debugger.debug(
                    "Reads_ID\tLocationA\tLocationB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
                )
            startTime = time.time()
            dContigPairs = collections.defaultdict(list)
            nJoinPairs = 0
            nReadsPairs = 0
            # each record holds the two mate lines separated by "\n"
            for record in run_samtools(bamFile, agBAMProgress):
                tmpRecord = record.split("\n")
                pairA = tmpRecord[0].split("\t")
                pairB = tmpRecord[1].split("\t")
                readsID = pairA[0]
                contigA = pairA[2]
                contigB = pairB[2]
                mateCtgB = pairA[6]   # NOTE(review): mate fields unused below
                mateCtgA = pairB[6]
                nReadsPairs += 1
                # the first contidition makes sure
                # single end BAM are gonna have zero
                # joining-pairs extracted
                if contigA == "*" or contigB == "*":
                    continue
                if pairA[0] == pairB[0] and contigA != contigB:
                    alnLenA = getCIGAR(pairA[5])
                    alnLenB = getCIGAR(pairB[5])
                    leftMostPosA = int(pairA[3])  # 1-based in SAM
                    leftMostPosB = int(pairB[3])
                    readLenA = len(pairA[9])
                    readLenB = len(pairB[9])
                    nMismatchesA = getMismatches(pairA[11:])
                    nMismatchesB = getMismatches(pairB[11:])
                    mapQA = int(pairA[4])
                    mapQB = int(pairB[4])
                    flagsA = explainSAMFlag(int(pairA[1]))
                    flagsB = explainSAMFlag(int(pairB[1]))
                    senseA = flagsA[4]
                    senseB = flagsB[4]
                    if debug:
                        agBAMDebug.debugger.debug(
                            "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d"
                            % (readsID, contigA + ":" + str(leftMostPosA),
                               contigB + ":" + str(leftMostPosB),
                               int(alnLenA), int(alnLenB), mapQA, mapQB,
                               senseA, senseB, readLenA, readLenB))
                    # NOTE(review): integer division under Python 2 unless
                    # getCIGAR/getMismatches return floats — verify.
                    fracOvlA = alnLenA / readLenA
                    fracOvlB = alnLenB / readLenB
                    fracMismatchA = nMismatchesA / alnLenA
                    fracMismatchB = nMismatchesB / alnLenB
                    if (min(fracOvlA, fracOvlB) >= minFracOvl and  # minimum fraction of overlaps
                            max(fracMismatchA, fracMismatchB) <= maxFracMismatch and  # maximum fraction of mismatches
                            min(mapQA, mapQB) >= minMapQ):  # minimum mapping quality
                        startA = leftMostPosA
                        stopA = startA + int(alnLenA) - 1
                        startB = leftMostPosB
                        stopB = startB + int(alnLenB) - 1
                        nJoinPairs += 1
                        # store under a canonical key: smaller contig first
                        if contigA <= contigB:
                            if (contigA, contigB) not in dContigPairs:
                                dContigPairs[contigA, contigB] = [
                                    (startA, startB, stopA, stopB,
                                     senseA, senseB, readsID)
                                ]
                            else:
                                dContigPairs[contigA, contigB] += [
                                    (startA, startB, stopA, stopB,
                                     senseA, senseB, readsID)
                                ]
                            fOUT.write(
                                "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
                                % (readsID, contigA, startA, stopA, senseA,
                                   contigB, startB, stopB, senseB))
                        else:
                            if (contigB, contigA) not in dContigPairs:
                                dContigPairs[contigB, contigA] = [
                                    (startB, startA, stopB, stopA,
                                     senseB, senseA, readsID)
                                ]
                            else:
                                dContigPairs[contigB, contigA] += [
                                    (startB, startA, stopB, stopA,
                                     senseB, senseA, readsID)
                                ]
                            fOUT.write(
                                "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
                                % (readsID, contigB, startB, stopB, senseB,
                                   contigA, startA, stopA, senseA))
                # periodic progress heartbeat
                if nReadsPairs % 5000000 == 0:
                    elapsedTime = float((time.time() - startTime) / 60)
                    agBAMProgress.logger.info(
                        "%d parsed\t| %s\t| %.2f m"
                        % (nReadsPairs, readsID, elapsedTime))
    except KeyboardInterrupt:
        agBAMProgress.logger.info(
            "Extract Joining-pairs INTERRUPTED by Keyboard")
        sys.exit(1)
    agBAMProgress.logger.info("%d reads pairs in the give BAM" % (nReadsPairs))
    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs"
                              % (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    else:
        agBAMProgress.logger.info("Succeeded")
    return dContigPairs
def shred_assembly(assemblyFile, breakerProgress, prefix, minGaps, minCtgLen):
    '''
        shred assembly at gaps of a minimum length

        Splits every scaffold in assemblyFile at runs of >= minGaps 'N'/'n'
        characters, merging fragments shorter than minCtgLen into their
        neighbor.  Writes:
          <prefix>.ctg.fasta       - the shredded contigs
          <prefix>.shred.info.txt  - adjacent-contig pairs and the gap length
                                     between them (for later path recovery)
          <prefix>.agp             - AGP placement of contigs and gaps
          <prefix>.shred_assembly.debug - verbose split decisions

        Returns dHeader2Intervals: dict scaffold header -> list of
        (start, stop) 0-based intervals kept as contigs.
    '''
    outDebugFile = prefix + ".shred_assembly.debug"
    breakDebug = agLOG.DEBUG("SHREDDER", outDebugFile)
    outFa = prefix + ".ctg.fasta"
    outInfo = prefix + ".shred.info.txt"
    outAGP = prefix + ".agp"
    dHeader2Intervals = collections.defaultdict(list)
    fAGP = open(outAGP, 'w')
    with open(outFa, 'w') as fOUTFA, open(outInfo, 'w') as fINFO:
        genomeSize = 0
        splitSize = 0
        numContigs = 0
        contigLens = []
        nSeqs = 0
        startTime = time.time()
        breakerProgress.logger.info(
            "# processed\t| Current sequence ID\t| Elapsed Time")
        for header, seq in agSeq.read_fasta(assemblyFile):
            nSeqs += 1
            breakDebug.debugger.debug(">%s" % (header))
            genomeSize += len(seq)
            # m.start() and m.end() zero-based
            # NOTE(review): the character class "[N|n]" also matches literal
            # '|'; "[Nn]" was probably intended — harmless for DNA sequences.
            gapIndices = [(m.start(), m.end() - 1)
                          for m in re.finditer("[N|n]{%d,}" % (minGaps), seq)]
            # sentinel "gap" at the sequence end so the last fragment closes
            gapIndices.append((len(seq), -1))
            breakDebug.debugger.debug("gapIndices: %s" % (str(gapIndices)))
            gapLens = []
            intervals = []
            if len(gapIndices) == 1:
                # no qualifying gap: whole sequence is one contig
                intervals.append((0, gapIndices[0][0]))
            elif gapIndices[-1][0] < minCtgLen:
                # sequence shorter than minCtgLen: keep whole, do not split
                intervals.append((0, gapIndices[-1][0]))
            else:
                start = 0
                i = 0
                for i in range(len(gapIndices)):
                    stop = gapIndices[i][0]
                    breakDebug.debugger.debug("start %d stop %d"
                                              % (start, stop))
                    # remainder too short: absorb it into the previous contig
                    if gapIndices[len(gapIndices)-1][0]-start < minCtgLen and \
                        len(gapIndices) > 1:
                        breakDebug.debugger.debug("last short")
                        breakDebug.debugger.debug("gapIndices[i]: %s"
                                                  % (str(gapIndices[i])))
                        breakDebug.debugger.debug("intervals: %s"
                                                  % (intervals))
                        #if len(intervals) > 0:
                        intervals[-1] = (intervals[-1][0],
                                         gapIndices[len(gapIndices) - 1][0])
                        #else:
                        #	intervals.append((start, gapIndices[len(gapIndices)-1][0]))
                        break
                    # fragment too short: skip the split (merge across gap)
                    if stop - start + 1 < minCtgLen:
                        breakDebug.debugger.debug("short")
                        breakDebug.debugger.debug(
                            "previous %s next %s"
                            % (str(gapIndices[i - 1]), str(gapIndices[i])))
                        breakDebug.debugger.debug("length: %d"
                                                  % (stop - start + 1))
                        continue
                    # record the gap length except for the end sentinel
                    if i < len(gapIndices) - 1:
                        gapLens.append(gapIndices[i][1] - gapIndices[i][0])
                    intervals.append((start, stop))
                    start = gapIndices[i][1] + 1
            breakDebug.debugger.debug("intervals: %s" % (intervals))
            breakDebug.debugger.debug("gapLen: %s" % (str(gapLens)))
            contigs = []
            for i in range(len(intervals)):
                dHeader2Intervals[header] += [intervals[i]]
                start = intervals[i][0]
                stop = intervals[i][1]
                splitSize += (stop - start)
                if len(intervals) == 1:
                    # single interval: keep the original header
                    contigID = "%s" % (header)
                    fOUTFA.write(">%s\n%s\n" % (header, seq[start:stop]))
                else:
                    contigID = "%s_%d" % (header, i)
                    fOUTFA.write(">%s\n%s\n" % (contigID, seq[start:stop]))
                contigs.append(contigID)
                contigLens.append(stop - start)
                # AGP: gap line (N) between consecutive contigs, then the
                # contig line (W); coordinates 1-based per the AGP spec
                if i > 0:
                    fAGP.write("%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n"
                               % (header, intervals[i - 1][1] + 2, start,
                                  i + 1, start - intervals[i - 1][1] - 1))
                    fAGP.write("%s\t%d\t%d\t%d\tW\t%s\t%d\t%d\t+\n"
                               % (header, start + 1, stop + 1, i + 1,
                                  contigID, start + 1, stop - start + 1))
                else:
                    fAGP.write("%s\t%d\t%d\t%d\tW\t%s\t%d\t%d\t+\n"
                               % (header, start + 1, stop + 1, i + 1,
                                  contigID, start + 1, stop - start + 1))
            numContigs += len(contigs)
            if nSeqs % 10000 == 0:
                elapsedTime = float((time.time() - startTime) / 60)
                breakerProgress.logger.info("%d processed\t| %s\t | %.2f m"
                                            % (nSeqs, header, elapsedTime))
            # shred-info: record adjacent contig pairs and their gap lengths
            fINFO.write(">%s\n" % (header))
            if len(contigs) == 1:
                fINFO.write("%s\tNA\tNA\n" % (contigs[0]))
                continue
            for i in range(1, len(contigs)):
                fINFO.write("%s\t%s\t%d\n"
                            % (contigs[i - 1], contigs[i], gapLens[i - 1]))
        # final heartbeat for small assemblies that never hit the modulus
        if nSeqs < 10000:
            elapsedTime = float((time.time() - startTime) / 60)
            breakerProgress.logger.info("%d processed\t| %s\t | %.2f m"
                                        % (nSeqs, header, elapsedTime))
    fAGP.close()
    n50 = agSeq.get_assembly_NXX(contigLens)
    breakerProgress.logger.info("Total length of the given assembly: %d"
                                % (genomeSize))
    breakerProgress.logger.info("Total length of the shred assembly: %d"
                                % (splitSize))
    breakerProgress.logger.info(
        "Number of sequences in the shred assembly: %d" % (numContigs))
    breakerProgress.logger.info("N50 of the shred assembly: %d" % (n50))
    return dHeader2Intervals
def shred_annotation(dHeader2Intervals, gffFile, prefix, breakerProgress):
    """Rewrite a scaffold-level GFF so its records map onto the shredded contigs.

    Streams gffFile once, buffering one gene (and its exon/CDS/UTR children)
    at a time in the pre* variables; each time a new gene ID appears, the
    buffered gene is projected onto the shred intervals of its scaffold via
    shred_gene() and written to <prefix>.ctg.gff.  Scaffolds absent from
    dHeader2Intervals are silently dropped.

    Args:
        dHeader2Intervals: scaffold header -> list of (start, stop) shred
            intervals (0-based), as produced by the sequence shredder
        gffFile: path to the scaffold-level GFF annotation
        prefix: prefix for the output GFF (<prefix>.ctg.gff) and the
            debug log (<prefix>.shred_annotation.debug)
        breakerProgress: progress logger (agLOG.PROGRESS_METER)
    """
    breakerProgress.logger.info("[BEGIN] Shredding annotation")
    outDebugFile = prefix + ".shred_annotation.debug"
    shredAnnDebug = agLOG.DEBUG("SHREDDER", outDebugFile)
    outGFF = prefix + ".ctg.gff"
    fOUT = open(outGFF, 'w')
    # only these feature types are carried over into the shredded GFF
    annotationType = ["gene", "exon", "CDS", "five_prime_UTR", "three_prime_UTR"]
    # counter used to synthesize names for genes lacking an ID attribute
    n = 1
    # one-gene lookbehind buffer: the gene currently being collected
    preGene = ""
    preStrand = ""
    preSource = ""
    preHeader = ""
    preStart = 0
    preStop = 0
    dAttrs = {}
    dAttrsPre = {}
    # feature types seen for the buffered gene, in order of first appearance
    features = []
    # feature type -> list of (start, stop) for the buffered gene
    dFeatures = collections.defaultdict(list)
    nGenes = 0
    nShredGenes = 0
    with open(gffFile, 'r') as fGFF:
        for line in fGFF:
            # stop at an embedded FASTA section, if any
            if line.startswith("##FASTA"):
                break
            if not line.startswith("#"):
                tmp_line = line.strip().split("\t")
                header = tmp_line[0]
                if header in dHeader2Intervals:
                    #intervals = dHeader2Intervals[header]
                    # no cut: the scaffold survived shredding in one piece,
                    # so its records can be copied through verbatim
                    if len(dHeader2Intervals[header]) == 1:
                        if tmp_line[2] in annotationType:
                            fOUT.write(line)
                            if tmp_line[2] == "gene":
                                nGenes += 1
                                nShredGenes += 1
                    # get cut: coordinates must be remapped per interval
                    else:
                        start = int(tmp_line[3])
                        stop = int(tmp_line[4])
                        if tmp_line[2] == "gene":
                            nGenes += 1
                            dAttrs = get_attributes(tmp_line[8])
                            if "ID" in dAttrs:
                                gene = dAttrs["ID"]
                            else:
                                # no ID attribute: synthesize a stable name
                                gene = "agouti_shred_gene_%d" % (n)
                                shredAnnDebug.debugger.debug(("Warning: no gene ID extracted from attribute. "
                                                              "Name given: %s" % (gene)))
                                n += 1
                            strand = tmp_line[6]
                            source = tmp_line[1]
                            if preGene == "":
                                # very first gene on a cut scaffold: just buffer it
                                preGene = gene
                                preStart = start
                                preStop = stop
                                preStrand = strand
                                preSource = source
                                preHeader = header
                                dAttrsPre = dAttrs
                            else:
                                if preGene != gene:
                                    # a new gene starts: flush the buffered one
                                    shredAnnDebug.debugger.debug("####%s [BEGIN]" % (preGene))
                                    shredAnnDebug.debugger.debug("====geneStart=%d geneStop=%d" % (preStart, preStop))
                                    # here to get how many intervals a gene spans
                                    shreds = []
                                    intervals = dHeader2Intervals[preHeader]
                                    for i in range(len(intervals)):
                                        interval = intervals[i]
                                        overlap = agDenoise.find_overlap(interval, (preStart, preStop))
                                        # NOTE(review): 0 appears to mean "interval
                                        # overlaps the gene" — confirm against
                                        # agDenoise.find_overlap
                                        if overlap == 0:
                                            # store (interval index, 1-based start, 1-based stop)
                                            shreds += [(i, interval[0] + 1, interval[1] + 1)]
                                    shredAnnDebug.debugger.debug("====shreds=%s" % (str(shreds)))
                                    nShredGenes += len(shreds)
                                    shred_gene(shreds, preGene, preStart, preStop,
                                               preStrand, preSource, preHeader,
                                               features, dFeatures, dAttrsPre,
                                               shredAnnDebug, fOUT)
                                    shredAnnDebug.debugger.debug("####%s [END]" % (preGene))
                                    # re-buffer with the gene that just started
                                    preGene = gene
                                    preStart = start
                                    preStop = stop
                                    preStrand = strand
                                    preSource = source
                                    preHeader = header
                                    # reset per-gene feature storage; note dFeatures is
                                    # rebuilt from the OLD feature list before that
                                    # list is cleared
                                    dFeatures = {k: [] for k in features}
                                    dAttrsPre = dAttrs
                                    features = []
                        # child features are accumulated for the buffered gene
                        elif tmp_line[2] == "exon":
                            if not "exon" in features:
                                features.append("exon")
                                dFeatures["exon"] = [(start, stop)]
                            else:
                                dFeatures["exon"] += [(start, stop)]
                        elif tmp_line[2] == "CDS":
                            if "CDS" not in features:
                                dFeatures["CDS"] = [(start, stop)]
                                features.append("CDS")
                            else:
                                dFeatures["CDS"] += [(start, stop)]
                        elif tmp_line[2] == "five_prime_UTR":
                            if not "five_prime_UTR" in features:
                                features.append("five_prime_UTR")
                                dFeatures["five_prime_UTR"] = [(start, stop)]
                            else:
                                dFeatures["five_prime_UTR"] += [(start, stop)]
                        elif tmp_line[2] == "three_prime_UTR":
                            if not "three_prime_UTR" in features:
                                features.append("three_prime_UTR")
                                dFeatures["three_prime_UTR"] = [(start, stop)]
                            else:
                                dFeatures["three_prime_UTR"] += [(start, stop)]
            else:
                # pass through the ##gff version pragma and plain '#'
                # comment lines; drop other '##' directives
                if line.startswith("##gff"):
                    fOUT.write(line)
                elif not line.startswith("##"):
                    fOUT.write(line)
    # dealing with the last gene: it never gets flushed by the loop above,
    # so repeat the flush here
    shredAnnDebug.debugger.debug("####%s [BEGIN]" % (preGene))
    shredAnnDebug.debugger.debug("====geneStart=%d geneStop=%d" % (preStart, preStop))
    shreds = []
    intervals = dHeader2Intervals[preHeader]
    for i in range(len(intervals)):
        interval = intervals[i]
        overlap = agDenoise.find_overlap(interval, (preStart, preStop))
        if overlap == 0:
            shreds += [(i, interval[0] + 1, interval[1] + 1)]
    nShredGenes += len(shreds)
    shred_gene(shreds, preGene, preStart, preStop,
               preStrand, preSource, preHeader,
               features, dFeatures, dAttrsPre,
               shredAnnDebug, fOUT)
    shredAnnDebug.debugger.debug("####%s [END]" % (preGene))
    # NOTE(review): "give" below looks like a typo for "given" (runtime
    # string, left untouched here)
    breakerProgress.logger.info("Number of genes in the give GFF: %d" % (nGenes))
    breakerProgress.logger.info("Number of genes in the shred GFF: %d" % (nShredGenes))
    fOUT.close()
def get_gene_models(gff, outDir, prefix, debug=0):
    """Parse gene models from a GFF file into per-contig lists.

    Makes two passes over the file: the first counts "gene" records so the
    AGOUTI_GFF objects can be pre-allocated, the second fills each model
    with its gene/start_codon/stop_codon/CDS features.  Parsing stops at a
    ##FASTA/##Fasta section if one is embedded in the GFF.

    Fixes over the previous revision:
      * the last gene model is no longer dropped when the GFF ends with a
        ##FASTA section (the old "if not stop" guard skipped the final
        append exactly in that case);
      * attribute parsing splits on the first '=' only and skips malformed
        fragments, so values containing '=' or a trailing ';' no longer
        raise ValueError;
      * a gene record without an ID attribute gets an empty geneID instead
        of an unbound/stale variable.

    Args:
        gff: path to the GFF annotation file
        outDir: AGOUTI output directory; an "agouti_GFFs" subdir is created
        prefix: prefix used for progress/debug log file names
        debug: when truthy, log per-sequence gene-model counts

    Returns:
        dict: contig ID -> list of AGOUTI_GFF gene models, sorted by
        (geneStart, geneStop) within each contig.

    Exits with status 1 (sys.exit) when the GFF contains zero genes.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_GFFs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_gff.progressMeter" % (prefix))
    agGFFProgress = agLOG.PROGRESS_METER(moduleName)
    agGFFProgress.add_file_handler(progressLogFile)
    agGFFProgress.logger.info("[BEGIN] Getting gene models")
    dGFFs = collections.defaultdict(list)
    nGene = 0
    with open(gff, 'r') as fIN:
        # first pass: count genes so models can be pre-allocated
        for line in fIN:
            if line.startswith("##FASTA") or line.startswith("##Fasta"):
                break
            # skip empty lines and lines starting with '#'
            if not line.startswith('#') and len(line.strip()) > 0:
                tmp_line = line.strip().split("\t")
                if tmp_line[2] == "gene":
                    nGene += 1
        if nGene == 0:
            agGFFProgress.logger.error("Found zero genes")
            agGFFProgress.logger.error("Please check your GFF file")
            sys.exit(1)
        lobj_GeneModels = [AGOUTI_GFF() for i in xrange(nGene)]
        geneIndex = -1
        fIN.seek(0)
        # second pass: populate the pre-allocated models
        for line in fIN:
            # stop before getting into the FASTA zone
            if line.startswith("##FASTA") or line.startswith("##Fasta"):
                break
            # skip empty lines and lines starting with '#'
            if not line.startswith('#') and line.strip():
                tmp_line = line.strip().split("\t")
                if tmp_line[2] == "gene":
                    geneIndex += 1
                    # extract the ID attribute; split on the first '=' only
                    # so values containing '=' cannot break unpacking, and
                    # skip fragments without '=' (e.g. a trailing ';')
                    geneID = ""
                    for attr in tmp_line[8].split(';'):
                        if '=' not in attr:
                            continue
                        attrID, attrVal = attr.split('=', 1)
                        if attrID == "ID":
                            geneID = attrVal
                            break
                    if geneIndex > 0:
                        # a new gene closes the previous one: file the
                        # previous model under its contig
                        dGFFs[lobj_GeneModels[geneIndex - 1].ctgID].append(
                            lobj_GeneModels[geneIndex - 1])
                    lobj_GeneModels[geneIndex].setGene(geneID,
                                                       int(tmp_line[3]),
                                                       int(tmp_line[4]))
                    lobj_GeneModels[geneIndex].setProgram(tmp_line[1])
                    lobj_GeneModels[geneIndex].setContigID(tmp_line[0])
                    lobj_GeneModels[geneIndex].setStrand(tmp_line[6])
                elif tmp_line[2] == "stop_codon":
                    lobj_GeneModels[geneIndex].setStopCodon()
                elif tmp_line[2] == "start_codon":
                    lobj_GeneModels[geneIndex].setStartCodon()
                elif tmp_line[2] == "CDS":
                    lobj_GeneModels[geneIndex].updateCDS(int(tmp_line[3]),
                                                         int(tmp_line[4]))
        # file the last gene model; it must be appended whether or not the
        # loop ended at a ##FASTA section (previously dropped in that case)
        if geneIndex >= 0:
            dGFFs[lobj_GeneModels[geneIndex].ctgID].append(
                lobj_GeneModels[geneIndex])
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_gff.debug" % (prefix))
        agGFFDebug = agLOG.DEBUG(moduleName, debugLogFile)
        agGFFDebug.debugger.debug("Sequence\tNum_Gene_Models")
    nGeneModels = 0
    for k, v in sorted(dGFFs.items()):
        genes = [(gene.geneStart, gene.geneStop) for gene in v]
        # make sure gene models are in ascending coordinate order
        soGenes = sorted(xrange(len(genes)), key=lambda j: genes[j])
        tmpV = [v[index] for index in soGenes]
        dGFFs[k] = tmpV
        nGeneModels += len(tmpV)
        if debug:
            agGFFDebug.debugger.debug("%s\t%d" % (k, len(tmpV)))
    agGFFProgress.logger.info("%d Gene Models parsed" % (nGeneModels))
    agGFFProgress.logger.info("[DONE]")
    return dGFFs