Example #1
0
def agouti_path_main(agoutiPaths, dSenses, vertex2Name, dGFFs,
                     dCtgPair2GenePair, oriScafPathFile, outDir, prefix):
    """Compare AGOUTI paths with the original scaffolding and recover it.

    Reads the original scaffold paths from ``oriScafPathFile``, reports the
    consistency between them and ``agoutiPaths``, then hands everything to
    ``recover_untouched_sequences`` and returns what it produces.

    Args:
        agoutiPaths: scaffolding paths produced by AGOUTI.
        dSenses: orientation information, forwarded to the recovery step.
        vertex2Name: lookup from vertex to contig name.
        dGFFs: gene models, forwarded to the recovery step.
        dCtgPair2GenePair: gene pairs supporting each contig pair.
        oriScafPathFile: file describing the original (shredded) scaffolds.
        outDir: base output directory; an ``agouti_path`` sub-directory is
            created inside it for the debug log.
        prefix: prefix used for output file names.

    Returns:
        The tuple ``(agoutiPaths, dCtgPair2GenePair, dSenses)`` returned by
        ``recover_untouched_sequences``.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_path")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    agPathProgress = agLOG.PROGRESS_METER(moduleName)
    agPathProgress.logger.info("Analyzing scaffolding paths")
    outDebugFile = os.path.join(moduleOutDir, prefix) + ".agouti_path.debug"
    # NOTE(review): the debug logger is named "SHREDDER" rather than
    # moduleName -- possibly a copy/paste leftover; confirm before changing.
    agPathDebug = agLOG.DEBUG("SHREDDER", outDebugFile)
    agPathProgress.logger.info("[BEGIN] Reading file with shred info")
    # dOriGaps is returned by the reader but not used in this function.
    dOriPaths, dOriGaps = read_original_path(oriScafPathFile, agPathProgress)
    agPathProgress.logger.info("[DONE]")

    # shut it off for now; working to improve it
    #agPathProgress.logger.info("[BEGIN] Checking consistency")
    #compare(dOriPaths, agoutiPaths, vertex2Name, outDir, prefix)
    #agPathProgress.logger.info("[DONE]")

    # The report is written relative to outDir, not moduleOutDir.
    report_consistency(agoutiPaths, dOriPaths, vertex2Name, outDir, prefix)

    agPathProgress.logger.info("[BEGIN] Recovering original scaffolding")
    agoutiPaths, dCtgPair2GenePair, dSenses = recover_untouched_sequences(
        dOriPaths, agoutiPaths, vertex2Name, dGFFs, dCtgPair2GenePair, dSenses,
        agPathProgress, agPathDebug)
    agPathProgress.logger.info("[DONE]")

    return agoutiPaths, dCtgPair2GenePair, dSenses
Example #2
0
def agouti_update(agoutiPaths,
                  dSeqs,
                  vertex2Name,
                  dSenses,
                  dGFFs,
                  dCtgPair2GenePair,
                  outDir,
                  prefix,
                  nFills=1000,
                  debug=0,
                  no_update_gff=0):
    """Build scaffold sequences from AGOUTI paths and update gene models.

    For every path the contigs are concatenated in path order, separated by
    ``nFills`` ``N`` characters, after orienting each joined pair according
    to ``decide_orientation``. Unless ``no_update_gff`` is set, the gene
    models of the joined contigs are merged and shifted onto the scaffold.
    Scaffolds and all contigs that were not scaffolded are written to
    ``<outDir>/<prefix>.agouti.fasta``; a summary is logged at the end.

    Args:
        agoutiPaths: iterable of paths; each path is a sequence of vertices
            usable as keys into ``dSeqs`` and ``vertex2Name``.
        dSeqs: lookup from vertex to contig sequence.
        vertex2Name: lookup from vertex to contig name.
        dSenses: orientation evidence, passed to ``get_orientation_counts``.
        dGFFs: gene models keyed by contig name. Modified in place: models
            of reversed contigs are replaced by their reversed version and
            entries of scaffolded contigs are removed.
        dCtgPair2GenePair: gene pairs keyed by vertex pair, queried through
            ``ctgpair2genepair``.
        outDir: base output directory.
        prefix: prefix for scaffold names and output files.
        nFills: number of ``N`` characters inserted between joined contigs.
        debug: when true, write a debug log under ``agouti_update``.
        no_update_gff: when true, skip every gene-model update and output.

    Returns:
        None.

    Raises:
        SystemExit: if a joined contig pair has no supporting gene pair.
    """

    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_update")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_update.progressMeter" % (prefix))
    global agUPDATEProgress
    agUPDATEProgress = agLOG.PROGRESS_METER(moduleName)
    agUPDATEProgress.add_file_handler(progressLogFile)
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_update.debug" % (prefix))
        global agUPDATEDebug
        agUPDATEDebug = agLOG.DEBUG(moduleName, debugLogFile)

    if not no_update_gff:
        agUPDATEProgress.logger.info("[BEGIN] Updating gene models")

    outFasta = os.path.join(outDir, "%s.agouti.fasta" % (prefix))
    fFASTA = open(outFasta, 'w')
    dUpdateGFFs = collections.defaultdict(list)
    dMergedGene2Ctgs = collections.defaultdict(list)
    dMergedGene2Genes = collections.defaultdict(list)
    scafPaths = []
    numMergedGene = 0
    nCtgScaffolded = 0
    scaffoldedCtgs = {}
    seqLens = []
    dScafGaps = {}
    dScafStats = {}
    scafID = 0
    mergedGenes = []
    for path in agoutiPaths:
        scafID += 1
        scafName = prefix + "_scaf_%d" % (scafID)
        dScafStats[scafName] = 0
        dScafGaps[scafName] = []

        curVertex = path[0]
        sequence = dSeqs[curVertex]
        curSense = "+"
        curCtg = vertex2Name[curVertex]
        preCtg = ""
        # Vertices in scaffold order; a negated vertex marks a contig that
        # was reverse-complemented.
        scafPath = [curVertex]
        preGeneID, curGeneID = "", ""
        mergedGene = agGFF.AGOUTI_GFF()
        preMergedGene = agGFF.AGOUTI_GFF()
        gapStart, gapStop = 0, 0
        offset = 0
        orientation = ""
        updatedGeneIDs = []
        mergedGenesPerPath = []
        excludeGeneIDs = []
        for nextVertex in path[1:]:
            nextCtg = vertex2Name[nextVertex]

            if preCtg == "":
                # First join of this path: log the whole path once.
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t>scaf_%d - path - %s" %
                        (scafID, str([vertex2Name[vertex]
                                      for vertex in path])))
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tcurVertex - %d - %s - nextVertex - %d - %s"
                    % (curVertex, curCtg, nextVertex, nextCtg))

            if not no_update_gff:
                curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair,
                                                     curVertex, nextVertex)
                # TODO (original author): should not abort here, should
                # continue with the next pair instead.
                # Either gene missing makes the join unusable; checking with
                # "or" avoids an AttributeError on the geneID access below
                # when only one of the two is None.
                if curGene is None or nextGene is None:
                    agUPDATEProgress.logger.error(
                        "%s - %s found no gene models joining them" %
                        (curCtg, nextCtg))
                    agUPDATEProgress.logger.error(
                        "This is NOT EXPECTED, REPORT!")
                    sys.exit(1)
                curGeneID = curGene.geneID
                excludeGeneIDs = [preGeneID] + [curGeneID]
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t\tpreGene - %s - curGene - %s - nextGene - %s"
                        % (preGeneID, curGene.geneID, nextGene.geneID))

            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafName - %s" %
                                             (scafName))
            FR, FF, RR, RF = get_orientation_counts(curVertex, nextVertex,
                                                    dSenses)
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" %
                                             (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" %
                    (FR, FF, RF, RR))
            if curSense == "-":
                # The current contig is already reversed in the scaffold, so
                # exchange the counts: FR <-> RR and FF <-> RF.
                FR, FF, RR, RF = RR, RF, FR, FF
            orientation = decide_orientation(FR, FF, RR, RF)

            # Gap coordinates on the scaffold; stored 1-based in dScafGaps.
            gapStart = gapStop + len(dSeqs[curVertex])
            gapStop = gapStart + nFills - 1
            dScafGaps[scafName].append((gapStart + 1, gapStop + 1))
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" %
                                             (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" %
                    (FR, FF, RF, RR))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\toffset - %d - curCtgLen - %d" %
                    (offset, len(dSeqs[curVertex])))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tgapstart - %d - gapstop - %d" %
                    (gapStart, gapStop + 1))
            if orientation == "FR":
                # Append the next contig as is.
                if not no_update_gff:
                    if curGeneID != preGeneID:
                        # A new merged gene starts at this join.
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        # Same gene as the previous join: extend the merged
                        # gene built so far and replace it in place.
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        if nextGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                nextGene.geneID
                            ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + dSeqs[nextVertex]
                scafPath += [nextVertex]
                curSense = "+"
            elif orientation == "FF":
                # Append the reverse complement of the next contig.
                if not no_update_gff:
                    dGFFs[nextCtg] = reverse_gene_models(
                        dGFFs[nextCtg], len(dSeqs[nextVertex]), debug)
                    if curGeneID != preGeneID:
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [
                            nextGene.geneID
                        ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + agSeq.rc_seq(dSeqs[nextVertex])
                scafPath += [-1 * nextVertex]
                curSense = "-"
            elif orientation == "RR":
                # Reverse-complement everything built so far, then append
                # the next contig as is.
                if not no_update_gff:
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(
                            dGFFs[curCtg], len(dSeqs[curVertex]), debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug)
                        # Models already placed on the scaffold follow the
                        # reversal of the accumulated sequence.
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName], gapStart - 1, debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [
                            nextGene.geneID
                        ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                     'N'*nFills + dSeqs[nextVertex]
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [nextVertex]
                curSense = "+"
            elif orientation == "RF":
                # Reverse-complement everything built so far and also the
                # next contig.
                if not no_update_gff:
                    dGFFs[nextCtg] = reverse_gene_models(
                        dGFFs[nextCtg], len(dSeqs[nextVertex]), debug)
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(
                            dGFFs[curCtg], len(dSeqs[curVertex]), debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                            dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                            offset, excludeGeneIDs, debug)
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName],
                            gapStop + len(dSeqs[curVertex]), debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [
                            nextGene.geneID
                        ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                     'N'*nFills + \
                     agSeq.rc_seq(dSeqs[nextVertex])
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [-1 * nextVertex]
                curSense = "-"
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tscafPath in vertices updates- %s" %
                    (str(scafPath)))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tdMergedGene2Gene - %s" %
                    (str(dMergedGene2Genes[mergedGene.geneID])))
            if not no_update_gff:
                mergedGenesPerPath.append(mergedGene.geneID)
                preGeneID = nextGene.geneID
            offset = gapStop
            preCtg = curCtg
            curVertex = nextVertex
            curCtg = vertex2Name[curVertex]

        # Translate vertices into contig names; reversed contigs (negative
        # vertices) get a leading "-".
        scafPath = [
            "-" + vertex2Name[-1 * v] if v < 0 else vertex2Name[v]
            for v in scafPath
        ]
        scafPaths += [scafPath]
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tscafPath in human-readable updates- %s" %
                (str(scafPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tappend last curCtg - %s" % (curCtg))
            agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafPath - %s" %
                                         (str(scafPath)))
        if not no_update_gff:
            # Carry over the gene models of the last contig of the path.
            excludeGeneIDs = [preGeneID]
            mergedGenes.append(mergedGenesPerPath)
            dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                dGFFs[curCtg], dUpdateGFFs[scafName], scafName, offset,
                excludeGeneIDs, debug)
        fFASTA.write(">%s |%dbp |%s\n%s\n" %
                     (scafName, len(sequence), ",".join(scafPath), sequence))
        dScafStats[scafName] = len(sequence)
        seqLens.append(len(sequence))
        nCtgScaffolded += len(scafPath)
        scaffoldedCtgs.update(dict((contig, 1) for contig in scafPath))
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tmergedGenesPerPath - %s" %
                (str(mergedGenesPerPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t-------------------------------------")

    agPATH.report_scaffold_path(scafPaths, vertex2Name, outDir, prefix)

    # other contigs need to be output
    agUPDATEProgress.logger.info("Finalizing sequences")
    for vertex in dSeqs:
        if vertex2Name[vertex] in scaffoldedCtgs or "-" + vertex2Name[
                vertex] in scaffoldedCtgs:
            continue
        fFASTA.write(">%s\n%s\n" % (vertex2Name[vertex], dSeqs[vertex]))
        dScafStats[vertex2Name[vertex]] = len(dSeqs[vertex])
        seqLens.append(len(dSeqs[vertex]))
    fFASTA.close()
    n50 = agSeq.get_assembly_NXX(seqLens)

    agUPDATEProgress.logger.info("Outputting updated Gene Models")
    for vertex in dSeqs:
        ctgName = vertex2Name[vertex]
        # Reversed contigs are recorded in scaffoldedCtgs with a leading "-",
        # so test both spellings (as the FASTA loop above does); otherwise
        # their per-contig gene models would survive next to the scaffold's.
        if ctgName in scaffoldedCtgs or "-" + ctgName in scaffoldedCtgs:
            if ctgName in dGFFs:
                del dGFFs[ctgName]
    if not no_update_gff:
        dFinalGFFs = dict(dGFFs, **dUpdateGFFs)
        numGenes = output_gff(dFinalGFFs, dMergedGene2Ctgs, dMergedGene2Genes,
                              dScafStats, dScafGaps, outDir, prefix)
        agUPDATEProgress.logger.info("Summarizing AGOUTI gene paths")
        summarize_gene_path(dMergedGene2Genes, dMergedGene2Ctgs, outDir,
                            prefix)

    agUPDATEProgress.logger.info("-----------Summary-----------")
    agUPDATEProgress.logger.info("number of contigs scaffolded: %d" %
                                 (nCtgScaffolded))
    agUPDATEProgress.logger.info("number of scaffolds: %d" % (scafID))
    agUPDATEProgress.logger.info(
        "number of contigs in the final assembly: %d" % (len(seqLens)))
    agUPDATEProgress.logger.info("Final assembly N50: %d" % (n50))
    if not no_update_gff:
        agUPDATEProgress.logger.info("Final number of genes: %d" % (numGenes))
    agUPDATEProgress.logger.info("Succeeded")
Example #3
0
def denoise_joining_pairs(dContigPairs,
                          dGFFs,
                          vertex2Name,
                          outDir,
                          prefix,
                          minSupport,
                          debug=0):
    """Filter joining read pairs down to those consistent with gene models.

    A contig pair is dropped (and removed from ``dContigPairs``) when it has
    fewer than ``minSupport`` supporting read pairs. For the remaining pairs
    a gene pair is looked up with ``get_genePair_for_contigPair``; where that
    lookup asks for it, gene models are created with ``create_fake_genes``
    and stored back into ``dGFFs``. A pair is kept only if the positions of
    the two genes on their contigs, the gene ends and the most frequent read
    sense pair match one of the accepted combinations. The read pairs
    accepted for a kept contig pair are written to
    ``<outDir>/agouti_denoise/<prefix>.agouti.join_pairs.noise_free.txt``.

    Args:
        dContigPairs: dict mapping ``(ctgA, ctgB)`` to a list of records
            ``(startA, startB, stopA, stopB, senseA, senseB, readID)``.
            Modified in place (under-supported pairs are deleted).
        dGFFs: gene models keyed by contig name. Modified in place.
        vertex2Name: sequence of contig names; ``.index()`` of a name is
            used as its vertex.
        outDir: base output directory.
        prefix: prefix for output file names.
        minSupport: minimum number of read pairs required per contig pair.
        debug: when true, write a debug log under ``agouti_denoise``.

    Returns:
        ``(dCtgPair2GenePair, dCtgPairDenoise)``, both keyed by
        ``(vertexA, vertexB)``: the first maps to ``[geneA, geneB]``, the
        second to ``[weight, (senseA, senseB)]`` where ``weight`` is the
        number of read pairs written for that contig pair.
    """

    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_denoise")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_denoise.progressMeter" % (prefix))
    agDENOISEProgress = agLOG.PROGRESS_METER(moduleName)
    agDENOISEProgress.add_file_handler(progressLogFile)

    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_denoise.debug" % (prefix))
        global agDENOISEDebug
        agDENOISEDebug = agLOG.DEBUG(moduleName, debugLogFile)

    agDENOISEProgress.logger.info("[BEGIN] Denoising joining pairs")
    # time.clock() no longer exists on Python >= 3.8; prefer process_time()
    # where available and fall back to clock() on older interpreters.
    cpuClock = getattr(time, "process_time", None) or time.clock
    startTime = cpuClock()
    dCtgPair2GenePair = collections.defaultdict()
    dCtgPairDenoise = collections.defaultdict()
    nFail4Combination = 0
    nFailGeneModel = 0
    nFailK = 0
    recordFormat = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
    outDenoiseJPFile = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.noise_free.txt" % (prefix))
    with open(outDenoiseJPFile, 'w') as fOUT:
        # Iterate over a snapshot: entries are deleted from dContigPairs
        # inside the loop, which is an error on a live dict view.
        for ctgPair, pairInfo in list(dContigPairs.items()):
            if len(pairInfo) < minSupport:
                nFailK += 1
                del dContigPairs[ctgPair]
                continue
            ctgA = ctgPair[0]
            ctgB = ctgPair[1]
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t>contigA - %s - contigB - %s" %
                    (ctgA, ctgB))
            mapIntervalsA = []
            mapIntervalsB = []
            senses = []
            keep = 0
            for startA, startB, stopA, stopB, senseA, senseB, readID in pairInfo:
                mapIntervalsA += [(startA, stopA)]
                mapIntervalsB += [(startB, stopB)]
                senses += [(senseA, senseB)]
            genePair = get_genePair_for_contigPair(dGFFs, ctgA, ctgB,
                                                   mapIntervalsA,
                                                   mapIntervalsB, senses,
                                                   debug)
            geneModelsA = dGFFs[ctgA]
            geneModelsB = dGFFs[ctgB]
            if genePair is None:
                nFailGeneModel += 1
                if debug:
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\tFail to find a pair of gene models")
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\t----------------------------------")
                continue

            (geneIndexA, geneIndexB, endA, endB, intervalsA, intervalsB,
             senses) = genePair
            sensesCounter = collections.Counter(senses)
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tsensesCounter: %s" % (str(sensesCounter)))
            if geneIndexB != 0:
                # create gene model according to endB using intervalsB
                if geneIndexB == -1 and (endB == 5 or endB == 0):
                    dGFFs[ctgB] = create_fake_genes(geneModelsB, 0, ctgB,
                                                    intervalsB, debug)
                    geneIndexB = 0
                    endB = 5
                elif geneIndexB == 1 and (endB == 3 or endB == 0):
                    dGFFs[ctgB] = create_fake_genes(geneModelsB,
                                                    len(geneModelsB), ctgB,
                                                    intervalsB, debug)
                    geneIndexB = len(dGFFs[ctgB]) - 1
                    endB = 3
            else:
                if endB == 0:
                    endB = 5
                elif endB == 3:
                    geneIndexB = len(dGFFs[ctgB]) - 1
            if geneIndexA != 0:
                # create gene model according to endA using intervalsA
                if geneIndexA == -1 and (endA == 5 or endA == 0):
                    dGFFs[ctgA] = create_fake_genes(geneModelsA, 0, ctgA,
                                                    intervalsA, debug)
                    geneIndexA = 0
                    endA = 5
                elif geneIndexA == 1 and (endA == 3 or endA == 0):
                    dGFFs[ctgA] = create_fake_genes(geneModelsA,
                                                    len(geneModelsA), ctgA,
                                                    intervalsA, debug)
                    geneIndexA = len(dGFFs[ctgA]) - 1
                    endA = 3
            else:
                if endA == 0:
                    endA = 3
                elif endA == 3:
                    geneIndexA = len(dGFFs[ctgA]) - 1
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tgenePair: %s" %
                                              (str(genePair)))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t# models on ctgA - %d - # models on ctgB - %d"
                    % (len(dGFFs[ctgA]), len(dGFFs[ctgB])))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneIndexA - %d - endA - %d - geneIndexB - %d - endB - %d"
                    % (geneIndexA, endA, geneIndexB, endB))
            # Most frequent sense pair among the supporting reads.
            sense = sorted(sensesCounter.items(),
                           key=operator.itemgetter(1),
                           reverse=True)[0][0]
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tsensePair - %s" %
                                              (str(sense)))

            lastIndexA = len(dGFFs[ctgA]) - 1
            lastIndexB = len(dGFFs[ctgB]) - 1
            lastGene3A = geneIndexA == lastIndexA and endA == 3
            firstGene5A = geneIndexA == 0 and endA == 5
            lastGene3B = geneIndexB == lastIndexB and endB == 3
            firstGene5B = geneIndexB == 0 and endB == 5
            if lastGene3A and firstGene5B and sense == ('+', '-'):
                # FR + 3'-5'
                keep = 1
            elif firstGene5A and firstGene5B and sense == ('-', '-'):
                # RR + 5'-5'
                keep = 1
            elif lastGene3A and lastGene3B and sense == ('+', '+'):
                # FF + 3'-3'
                keep = 1
            elif firstGene5A and lastGene3B and sense == ('-', '+'):
                # RF + 5'-3'
                keep = 1
            elif (geneIndexA == 0 and (endA == 0 or endA == 3)) and \
                 (geneIndexB == 0 and (endB == 0 or endB == 5)) and \
                 sense == ('+', '-'):
                # only one gene on the contig
                # it doesn't matter which end
                keep = 1

            if not keep:
                nFail4Combination += 1
                continue

            geneA = dGFFs[ctgA][geneIndexA]
            geneB = dGFFs[ctgB][geneIndexB]
            # Look the vertices up once; both result dicts share the key.
            vertexPair = (vertex2Name.index(ctgA), vertex2Name.index(ctgB))
            dCtgPair2GenePair[vertexPair] = [geneA, geneB]
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tNOISE-FREE")
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneA ID - %s - startA - %d - stopA = %d"
                    % (geneA.geneID, geneA.geneStart, geneA.geneStop))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneB ID - %s - startB - %d - stopB = %d"
                    % (geneB.geneID, geneB.geneStart, geneB.geneStop))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t----------------------------------")
            senseA = sense[0]
            senseB = sense[1]
            # An empty interval list on a side means every read is accepted
            # on that side; otherwise the read's interval must give a
            # find_overlap() result of 0 against the chosen gene's span.
            useAllA = len(intervalsA) == 0
            useAllB = len(intervalsB) == 0
            weight = 0
            for startA, startB, stopA, stopB, _, _, readID in pairInfo:
                acceptA = useAllA or find_overlap(
                    (startA, stopA), (geneA.geneStart, geneA.geneStop)) == 0
                acceptB = useAllB or find_overlap(
                    (startB, stopB), (geneB.geneStart, geneB.geneStop)) == 0
                if acceptA and acceptB:
                    fOUT.write(recordFormat %
                               (readID, ctgA, startA, senseA, ctgB, startB,
                                senseB))
                    weight += 1
            dCtgPairDenoise[vertexPair] = [weight, (senseA, senseB)]
    agDENOISEProgress.logger.info("Succeeded")
    agDENOISEProgress.logger.info("Denoise took in %.2f min CPU time" %
                                  ((cpuClock() - startTime) / 60))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for spanning across >1 gene models" %
        (nFailGeneModel))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for not being one of the four combinations" %
        (nFail4Combination))
    agDENOISEProgress.logger.info("%d contig pairs filtered for less support" %
                                  (nFailK))
    agDENOISEProgress.logger.info("%d contig pairs for scaffolding" %
                                  (len(dCtgPairDenoise)))
    return dCtgPair2GenePair, dCtgPairDenoise
Example #4
0
def get_joining_pairs(bamStream,
                      outDir,
                      prefix,
                      overwrite,
                      minMapQ=5,
                      minFracOvl=0.0,
                      maxFracMismatch=1.0,
                      debug=0):
    """Collect read pairs whose two mates map to different contigs.

    Records are read two lines at a time from ``bamStream`` and split on
    tabs.  The column indices used below (0 read ID, 1 flag, 2 contig,
    3 leftmost position, 4 mapping quality, 5 CIGAR, 9 read sequence,
    11+ optional tags) presumably follow the SAM text layout -- confirm
    with the caller that produces ``bamStream``.

    A pair is kept when both lines carry the same read ID, the contigs
    differ, and all three filters pass: smallest aligned fraction
    >= ``minFracOvl``, largest mismatch fraction <= ``maxFracMismatch``,
    smallest mapping quality >= ``minMapQ``.

    Kept pairs are written to
    ``<outDir>/agouti_join_pairs/<prefix>.agouti.join_pairs.all.txt`` and
    collected in a dict keyed by ``(contig, contig)`` in sorted order;
    each value is a list of
    ``(start1, start2, stop1, stop2, sense1, sense2, readsID)`` tuples.

    When a progress file from a previous run exists and ``overwrite`` is
    false, the result of ``retrieve_joininng_pairs`` is returned instead
    (unless it is None, in which case the stream is processed).

    Exits the process with status 1 when no joining pair is found.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)

    # Check for a previous run once, before any handler touches the file.
    hasPreviousRun = os.path.exists(progressLogFile)
    if hasPreviousRun and not overwrite:
        agBAMProgress.add_file_handler(progressLogFile, 'a')
        previousPairs = retrieve_joininng_pairs(agBAMProgress,
                                                agBAMOutAllJoinPairs)
        if previousPairs is not None:
            return previousPairs
        agBAMProgress.logger.info(
            "Fail to pick up results from the previous run")
        agBAMProgress.logger.info("Re-processing the BAM file")
    else:
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
        if hasPreviousRun:
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")

    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)

    outLineFormat = "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
    with open(agBAMOutAllJoinPairs, 'w') as fOUT:
        agBAMProgress.logger.info(
            "# processed\t| Current Reads ID\t| Elapsed Time")
        if debug:
            agBAMDebug.debugger.debug(
                "Reads_ID\tLocationA\tLocationB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
            )
        startTime = time.time()
        dContigPairs = collections.defaultdict(list)
        nJoinPairs = 0
        nReadsPairs = 0
        while True:
            pairA = bamStream.readline().strip().split("\t")
            pairB = bamStream.readline().strip().split("\t")
            # A line without any tab (e.g. the empty string at end of
            # stream) splits into a single field: stop reading.
            if len(pairA) == 1 or len(pairB) == 1:
                break
            readsID = pairA[0]
            contigA = pairA[2]
            contigB = pairB[2]
            nReadsPairs += 1
            if readsID == pairB[0] and contigA != contigB:
                alnLenA, alnLenB = getCIGAR(pairA[5]), getCIGAR(pairB[5])
                leftMostPosA, leftMostPosB = int(pairA[3]), int(pairB[3])
                readLenA, readLenB = len(pairA[9]), len(pairB[9])
                nMismatchesA = getMismatches(pairA[11:])
                nMismatchesB = getMismatches(pairB[11:])
                mapQA, mapQB = int(pairA[4]), int(pairB[4])
                flagsA = explainSAMFlag(int(pairA[1]))
                flagsB = explainSAMFlag(int(pairB[1]))
                senseA, senseB = flagsA[4], flagsB[4]
                if debug:
                    agBAMDebug.debugger.debug(
                        "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d" %
                        (readsID, "%s:%d" % (contigA, leftMostPosA),
                         "%s:%d" % (contigB, leftMostPosB), mapQA, mapQB,
                         senseA, senseB, readLenA, readLenB))

                # The three filters short-circuit in this order.
                passesFilters = (
                    # minimum fraction of overlaps
                    min(alnLenA / readLenA, alnLenB / readLenB) >= minFracOvl
                    # maximum fraction of mismatches
                    and max(nMismatchesA / alnLenA,
                            nMismatchesB / alnLenB) <= maxFracMismatch
                    # minimum mapping quality
                    and min(mapQA, mapQB) >= minMapQ)
                if passesFilters:
                    startA = leftMostPosA + 1
                    stopA = startA + 1 + int(alnLenA)
                    startB = leftMostPosB + 1
                    stopB = startB + 1 + int(alnLenB)
                    nJoinPairs += 1
                    # Each mate is (contig, start, stop, sense); the mate on
                    # the lexicographically smaller contig goes first.
                    mateA = (contigA, startA, stopA, senseA)
                    mateB = (contigB, startB, stopB, senseB)
                    if contigA <= contigB:
                        first, second = mateA, mateB
                    else:
                        first, second = mateB, mateA
                    dContigPairs[first[0], second[0]].append(
                        (first[1], second[1], first[2], second[2],
                         first[3], second[3], readsID))
                    fOUT.write(outLineFormat % ((readsID,) + first + second))
            if nReadsPairs % 5000000 == 0:
                elapsedTime = float((time.time() - startTime) / 60)
                agBAMProgress.logger.info("%d parsed\t| %s\t| %.2f m" %
                                          (nReadsPairs, readsID, elapsedTime))

    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs" %
                              (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    agBAMProgress.logger.info("Succeeded")
    return dContigPairs
Example #5
0
def agouti_sam_main(bamFile,
                    outDir,
                    prefix,
                    overwrite,
                    minMapQ,
                    minFracOvl,
                    maxFracMismatch,
                    debug=0):
    """Extract joining pairs (mates mapped to different contigs) from a BAM.

    Each item yielded by ``run_samtools(bamFile, ...)`` is split on a
    newline into two tab-separated records, one per mate.  A pair is kept
    when neither mate has contig ``"*"``, both records carry the same read
    ID, the contigs differ, and all three filters pass: smallest aligned
    fraction >= ``minFracOvl``, largest mismatch fraction
    <= ``maxFracMismatch``, smallest mapping quality >= ``minMapQ``.

    Kept pairs are written to
    ``<outDir>/agouti_join_pairs/<prefix>.agouti.join_pairs.all.txt`` and
    collected in a dict keyed by ``(contig, contig)`` in sorted order;
    each value is a list of
    ``(start1, start2, stop1, stop2, sense1, sense2, readsID)`` tuples,
    where start is the record's leftmost position and
    stop = start + aligned length - 1.

    When a progress file from a previous run exists and ``overwrite`` is
    false, the result of ``retrieve_joininng_pairs`` is returned instead
    (unless it is None, in which case the BAM is processed again).

    Exits the process with status 1 on KeyboardInterrupt or when no
    joining pair is found.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    if not os.path.exists(progressLogFile):
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
    else:
        if not overwrite:
            agBAMProgress.add_file_handler(progressLogFile, 'a')
            dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                                   agBAMOutAllJoinPairs)
            if dContigPairs is not None:
                return dContigPairs
            else:
                agBAMProgress.logger.info(
                    "Fail to pick up results from the previous run")
                agBAMProgress.logger.info("Re-processing the BAM file")
        else:
            agBAMProgress.add_file_handler(progressLogFile)
            agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")

    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)

    # before running samtools, check its availability
    agBAMProgress.logger.info("check SAMtools")
    check_samtools(agBAMProgress)

    # running samtools
    agBAMProgress.logger.info("run SAMtools")

    outLineFormat = "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
    try:
        with open(agBAMOutAllJoinPairs, 'w') as fOUT:
            agBAMProgress.logger.info(
                "# processed\t| Current Reads ID\t| Elapsed Time")
            if debug:
                # Header columns match, one for one, the eleven fields of
                # the per-pair debug record written below.
                agBAMDebug.debugger.debug(
                    "Reads_ID\tLocationA\tLocationB\talnLenA\talnLenB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
                )
            startTime = time.time()
            dContigPairs = collections.defaultdict(list)
            nJoinPairs = 0
            nReadsPairs = 0
            for record in run_samtools(bamFile, agBAMProgress):
                tmpRecord = record.split("\n")
                pairA = tmpRecord[0].split("\t")
                pairB = tmpRecord[1].split("\t")
                readsID = pairA[0]
                contigA = pairA[2]
                contigB = pairB[2]
                nReadsPairs += 1
                # Pairs with an unmapped mate (contig "*") are never
                # joining pairs; per the original author's note this also
                # makes single-end BAMs yield zero joining pairs.  The test
                # is folded into the condition (instead of `continue`) so
                # the progress meter below is still reached for such pairs.
                isCandidate = (contigA != "*" and contigB != "*"
                               and readsID == pairB[0]
                               and contigA != contigB)
                if isCandidate:
                    alnLenA = getCIGAR(pairA[5])
                    alnLenB = getCIGAR(pairB[5])
                    leftMostPosA = int(pairA[3])  # 1-based in SAM
                    leftMostPosB = int(pairB[3])
                    readLenA = len(pairA[9])
                    readLenB = len(pairB[9])
                    nMismatchesA = getMismatches(pairA[11:])
                    nMismatchesB = getMismatches(pairB[11:])
                    mapQA = int(pairA[4])
                    mapQB = int(pairB[4])
                    flagsA = explainSAMFlag(int(pairA[1]))
                    flagsB = explainSAMFlag(int(pairB[1]))
                    senseA = flagsA[4]
                    senseB = flagsB[4]
                    if debug:
                        agBAMDebug.debugger.debug(
                            "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d" %
                            (readsID, contigA + ":" + str(leftMostPosA),
                             contigB + ":" + str(leftMostPosB), int(alnLenA),
                             int(alnLenB), mapQA, mapQB, senseA, senseB,
                             readLenA, readLenB))

                    fracOvlA = alnLenA / readLenA
                    fracOvlB = alnLenB / readLenB
                    fracMismatchA = nMismatchesA / alnLenA
                    fracMismatchB = nMismatchesB / alnLenB
                    if (min(fracOvlA, fracOvlB) >= minFracOvl
                            and  # minimum fraction of overlaps
                            max(fracMismatchA,
                                fracMismatchB) <= maxFracMismatch
                            and  # maximum fraction of mismatches
                            min(mapQA,
                                mapQB) >= minMapQ):  # minimum mapping quality
                        startA = leftMostPosA
                        stopA = startA + int(alnLenA) - 1
                        startB = leftMostPosB
                        stopB = startB + int(alnLenB) - 1
                        nJoinPairs += 1
                        # Each mate is (contig, start, stop, sense); the
                        # mate on the lexicographically smaller contig is
                        # stored and written first.
                        mateA = (contigA, startA, stopA, senseA)
                        mateB = (contigB, startB, stopB, senseB)
                        if contigA <= contigB:
                            first, second = mateA, mateB
                        else:
                            first, second = mateB, mateA
                        # dContigPairs is a defaultdict(list): no need to
                        # test for the key before appending.
                        dContigPairs[first[0], second[0]].append(
                            (first[1], second[1], first[2], second[2],
                             first[3], second[3], readsID))
                        fOUT.write(outLineFormat %
                                   ((readsID,) + first + second))
                if nReadsPairs % 5000000 == 0:
                    elapsedTime = float((time.time() - startTime) / 60)
                    agBAMProgress.logger.info(
                        "%d parsed\t| %s\t| %.2f m" %
                        (nReadsPairs, readsID, elapsedTime))
    except KeyboardInterrupt:
        agBAMProgress.logger.info(
            "Extract Joining-pairs INTERRUPTED by Keyboard")
        sys.exit(1)

    agBAMProgress.logger.info("%d reads pairs in the give BAM" % (nReadsPairs))
    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs" %
                              (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    else:
        agBAMProgress.logger.info("Succeeded")
    return dContigPairs
Example #6
0
def shred_assembly(assemblyFile, breakerProgress, prefix, minGaps, minCtgLen):
    '''
    Shred an assembly at gaps of a minimum length.

    Every sequence read from ``assemblyFile`` is cut at runs of at least
    ``minGaps`` consecutive N/n characters.  A piece that would be
    shorter than ``minCtgLen`` is not emitted on its own: a short piece
    is merged with what follows it, and a short tail is merged into the
    previously accepted piece.

    Files written (all named from ``prefix``):
      * ``.ctg.fasta``       -- the shredded contigs
      * ``.shred.info.txt``  -- per sequence, adjacent contig pairs with
                                the recorded gap value between them
      * ``.agp``             -- AGP-style lines for gaps and contigs
      * ``.shred_assembly.debug`` -- debug log

    Returns a dict mapping each sequence header to its list of
    ``(start, stop)`` intervals, where ``start`` is a 0-based offset and
    ``stop`` is the exclusive slice end used as ``seq[start:stop]``.
    '''
    outDebugFile = prefix + ".shred_assembly.debug"
    breakDebug = agLOG.DEBUG("SHREDDER", outDebugFile)
    outFa = prefix + ".ctg.fasta"
    outInfo = prefix + ".shred.info.txt"
    outAGP = prefix + ".agp"
    dHeader2Intervals = collections.defaultdict(list)
    # Compiled once for all sequences.  The class is [Nn]: inside a
    # character class "|" is a literal, so "[N|n]" would also match "|".
    gapPattern = re.compile("[Nn]{%d,}" % (minGaps))
    # All three outputs live in the same `with` so none of them leaks if
    # reading or writing raises.
    with open(outAGP, 'w') as fAGP, open(outFa, 'w') as fOUTFA, \
            open(outInfo, 'w') as fINFO:
        genomeSize = 0
        splitSize = 0
        numContigs = 0
        contigLens = []
        nSeqs = 0
        # Bound up front so the final progress line below does not fail
        # on an unbound name when the assembly file yields no sequence.
        header = ""
        startTime = time.time()
        breakerProgress.logger.info(
            "# processed\t| Current sequence ID\t| Elapsed Time")
        for header, seq in agSeq.read_fasta(assemblyFile):
            nSeqs += 1
            breakDebug.debugger.debug(">%s" % (header))
            seqLen = len(seq)
            genomeSize += seqLen
            # m.start() and m.end() zero-based; each entry is
            # (first gap index, last gap index)
            gapIndices = [(m.start(), m.end() - 1)
                          for m in gapPattern.finditer(seq)]
            # sentinel marking the end of the sequence
            gapIndices.append((seqLen, -1))
            breakDebug.debugger.debug("gapIndices: %s" % (str(gapIndices)))
            gapLens = []
            intervals = []
            if len(gapIndices) == 1 or seqLen < minCtgLen:
                # no qualifying gap, or the whole sequence is too short
                # to be cut: keep it as one piece
                intervals.append((0, seqLen))
            else:
                start = 0
                for i, (gapStart, gapEnd) in enumerate(gapIndices):
                    stop = gapStart
                    breakDebug.debugger.debug("start %d stop %d" %
                                              (start, stop))
                    if seqLen - start < minCtgLen:
                        # Everything left is shorter than a contig:
                        # stretch the previous interval to the end.
                        # `start` only moves after an interval has been
                        # appended, and the start == 0 case is excluded
                        # by the branch above, so intervals is non-empty.
                        breakDebug.debugger.debug("last short")
                        breakDebug.debugger.debug("gapIndices[i]: %s" %
                                                  (str(gapIndices[i])))
                        breakDebug.debugger.debug("intervals: %s" %
                                                  (intervals))
                        intervals[-1] = (intervals[-1][0], seqLen)
                        break
                    if stop - start + 1 < minCtgLen:
                        # Piece too short: do not cut at this gap, keep
                        # `start` so the piece merges with what follows.
                        breakDebug.debugger.debug("short")
                        breakDebug.debugger.debug(
                            "previous %s next %s" %
                            (str(gapIndices[i - 1]), str(gapIndices[i])))
                        breakDebug.debugger.debug("length: %d" %
                                                  (stop - start + 1))
                        continue
                    if i < len(gapIndices) - 1:
                        # NOTE(review): last index minus first index is
                        # one less than the number of gap characters;
                        # kept as-is because consumers of the info file
                        # may rely on this value -- confirm.
                        gapLens.append(gapEnd - gapStart)
                    intervals.append((start, stop))
                    start = gapEnd + 1
            breakDebug.debugger.debug("intervals: %s" % (intervals))
            breakDebug.debugger.debug("gapLen: %s" % (str(gapLens)))
            contigs = []
            for i, (start, stop) in enumerate(intervals):
                dHeader2Intervals[header].append((start, stop))
                splitSize += (stop - start)
                # an uncut sequence keeps its header; pieces get _<index>
                if len(intervals) == 1:
                    contigID = "%s" % (header)
                else:
                    contigID = "%s_%d" % (header, i)
                fOUTFA.write(">%s\n%s\n" % (contigID, seq[start:stop]))
                contigs.append(contigID)
                contigLens.append(stop - start)
                if i > 0:
                    # gap line between the previous piece and this one
                    fAGP.write("%s\t%d\t%d\t%d\tN\t%d\tfragment\tyes\n" %
                               (header, intervals[i - 1][1] + 2, start, i + 1,
                                start - intervals[i - 1][1] - 1))
                fAGP.write("%s\t%d\t%d\t%d\tW\t%s\t%d\t%d\t+\n" %
                           (header, start + 1, stop + 1, i + 1, contigID,
                            start + 1, stop - start + 1))
            numContigs += len(contigs)
            if nSeqs % 10000 == 0:
                elapsedTime = float((time.time() - startTime) / 60)
                breakerProgress.logger.info("%d processed\t| %s\t | %.2f m" %
                                            (nSeqs, header, elapsedTime))
            fINFO.write(">%s\n" % (header))
            if len(contigs) == 1:
                fINFO.write("%s\tNA\tNA\n" % (contigs[0]))
                continue
            for i in range(1, len(contigs)):
                fINFO.write("%s\t%s\t%d\n" %
                            (contigs[i - 1], contigs[i], gapLens[i - 1]))
        if nSeqs < 10000:
            # the periodic meter above never fired: emit one final line
            elapsedTime = float((time.time() - startTime) / 60)
            breakerProgress.logger.info("%d processed\t| %s\t | %.2f m" %
                                        (nSeqs, header, elapsedTime))
        n50 = agSeq.get_assembly_NXX(contigLens)
        breakerProgress.logger.info("Total length of the given assembly: %d" %
                                    (genomeSize))
        breakerProgress.logger.info("Total length of the shred assembly: %d" %
                                    (splitSize))
        breakerProgress.logger.info(
            "Number of sequences in the shred assembly: %d" % (numContigs))
        breakerProgress.logger.info("N50 of the shred assembly: %d" % (n50))
        return dHeader2Intervals
Example #7
0
def _shred_recorded_gene(dHeader2Intervals, gene, geneStart, geneStop, strand,
                         source, header, features, dFeatures, dAttrs,
                         shredAnnDebug, fOUT):
    """Hand one recorded gene to shred_gene and return its shred count.

    Collects every interval of ``header`` for which
    ``agDenoise.find_overlap(interval, (geneStart, geneStop))`` returns 0
    (presumably the "overlaps" signal -- confirm against find_overlap),
    as ``(interval index, interval start + 1, interval stop + 1)``, then
    passes them with the gene's data to ``shred_gene``.
    """
    shredAnnDebug.debugger.debug("####%s [BEGIN]" % (gene))
    shredAnnDebug.debugger.debug("====geneStart=%d geneStop=%d" %
                                 (geneStart, geneStop))
    # here to get how many intervals a gene spans
    shreds = []
    for i, interval in enumerate(dHeader2Intervals[header]):
        overlap = agDenoise.find_overlap(interval, (geneStart, geneStop))
        if overlap == 0:
            shreds.append((i, interval[0] + 1, interval[1] + 1))
    shredAnnDebug.debugger.debug("====shreds=%s" % (str(shreds)))
    shred_gene(shreds, gene, geneStart, geneStop, strand, source, header,
               features, dFeatures, dAttrs, shredAnnDebug, fOUT)
    shredAnnDebug.debugger.debug("####%s [END]" % (gene))
    return len(shreds)


def shred_annotation(dHeader2Intervals, gffFile, prefix, breakerProgress):
    """Rewrite a GFF so it matches a shredded assembly.

    ``dHeader2Intervals`` maps a sequence header to the list of intervals
    it was cut into.  Lines are read from ``gffFile`` up to a ``##FASTA``
    line and the result is written to ``<prefix>.ctg.gff``:

      * comment lines are copied when they start with ``##gff`` or do
        not start with ``##``;
      * records whose sequence is not a key of ``dHeader2Intervals`` are
        dropped;
      * records on a sequence with a single interval are copied as-is
        when their type is gene, exon, CDS, five_prime_UTR or
        three_prime_UTR;
      * on a sequence with several intervals, a gene record starts a new
        gene and the following exon/CDS/UTR records are accumulated for
        it; when the next gene with a different ID arrives (and once
        more after the last line) the accumulated gene is handed to
        ``shred_gene``, which does the actual writing.

    Gene counts before and after shredding are reported through
    ``breakerProgress``.  Returns None.
    """
    breakerProgress.logger.info("[BEGIN] Shredding annotation")
    outDebugFile = prefix + ".shred_annotation.debug"
    shredAnnDebug = agLOG.DEBUG("SHREDDER", outDebugFile)
    outGFF = prefix + ".ctg.gff"
    annotationType = [
        "gene", "exon", "CDS", "five_prime_UTR", "three_prime_UTR"
    ]
    # sub-features accumulated per gene on sequences that were cut
    subFeatureTypes = ("exon", "CDS", "five_prime_UTR", "three_prime_UTR")
    n = 1
    preGene = ""
    preStrand = ""
    preSource = ""
    preHeader = ""
    preStart = 0
    preStop = 0
    dAttrs = {}
    dAttrsPre = {}
    features = []
    dFeatures = collections.defaultdict(list)
    nGenes = 0
    nShredGenes = 0
    # Both files are managed by `with` so the output handle is closed
    # even if parsing or shred_gene raises.
    with open(outGFF, 'w') as fOUT, open(gffFile, 'r') as fGFF:
        for line in fGFF:
            if line.startswith("##FASTA"):
                break
            if line.startswith("#"):
                if line.startswith("##gff") or not line.startswith("##"):
                    fOUT.write(line)
                continue
            tmp_line = line.strip().split("\t")
            header = tmp_line[0]
            if header not in dHeader2Intervals:
                continue
            featureType = tmp_line[2]
            # no cut
            if len(dHeader2Intervals[header]) == 1:
                if featureType in annotationType:
                    fOUT.write(line)
                    if featureType == "gene":
                        nGenes += 1
                        nShredGenes += 1
                continue
            # get cut
            start = int(tmp_line[3])
            stop = int(tmp_line[4])
            if featureType == "gene":
                nGenes += 1
                dAttrs = get_attributes(tmp_line[8])
                if "ID" in dAttrs:
                    gene = dAttrs["ID"]
                else:
                    gene = "agouti_shred_gene_%d" % (n)
                    shredAnnDebug.debugger.debug((
                        "Warning: no gene ID extracted from attribute. "
                        "Name given: %s" % (gene)))
                    n += 1
                strand = tmp_line[6]
                source = tmp_line[1]
                isFirstGene = (preGene == "")
                if isFirstGene or preGene != gene:
                    if not isFirstGene:
                        # a different gene starts: emit the recorded one
                        nShredGenes += _shred_recorded_gene(
                            dHeader2Intervals, preGene, preStart, preStop,
                            preStrand, preSource, preHeader, features,
                            dFeatures, dAttrsPre, shredAnnDebug, fOUT)
                        # Fresh containers for the new gene; the old ones
                        # were handed to shred_gene and are not reused.
                        dFeatures = {k: [] for k in features}
                        features = []
                    preGene = gene
                    preStart = start
                    preStop = stop
                    preStrand = strand
                    preSource = source
                    preHeader = header
                    dAttrsPre = dAttrs
            elif featureType in subFeatureTypes:
                if featureType not in features:
                    # first record of this type for the current gene
                    features.append(featureType)
                    dFeatures[featureType] = [(start, stop)]
                else:
                    dFeatures[featureType].append((start, stop))
        # Dealing with the last gene.  Skipped when no gene was recorded:
        # looking up dHeader2Intervals[""] would insert a bogus key into
        # a defaultdict (or raise KeyError on a plain dict).
        if preGene != "":
            nShredGenes += _shred_recorded_gene(
                dHeader2Intervals, preGene, preStart, preStop, preStrand,
                preSource, preHeader, features, dFeatures, dAttrsPre,
                shredAnnDebug, fOUT)

    breakerProgress.logger.info("Number of genes in the give GFF: %d" %
                                (nGenes))
    breakerProgress.logger.info("Number of genes in the shred GFF: %d" %
                                (nShredGenes))
Example #8
0
def get_gene_models(gff, outDir, prefix, debug=0):
    """Parse the gene models of a GFF file and group them by sequence.

    The file is read up to (not including) an optional ``##FASTA`` section.
    Every ``gene`` record starts a new AGOUTI_GFF object; subsequent
    ``start_codon``, ``stop_codon`` and ``CDS`` records are applied to the
    most recently seen gene. All other feature types are ignored.

    Args:
        gff: path to the GFF file to read.
        outDir: base output directory; log files are written to
            ``<outDir>/agouti_GFFs`` (created if missing).
        prefix: prefix used for the log file names.
        debug: when truthy, a debug log with the number of gene models per
            sequence is written as well.

    Returns:
        A ``collections.defaultdict(list)`` keyed by each gene's ``ctgID``
        (set from GFF column 1 through ``setContigID``) whose values are the
        AGOUTI_GFF objects of that sequence sorted by
        ``(geneStart, geneStop)``.

    Exits the program with status 1 (after logging an error) when the file
    contains no gene records or when a gene record has no ``ID`` attribute.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_GFFs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_gff.progressMeter" % (prefix))
    agGFFProgress = agLOG.PROGRESS_METER(moduleName)
    agGFFProgress.add_file_handler(progressLogFile)
    agGFFProgress.logger.info("[BEGIN] Getting gene models")

    dGFFs = collections.defaultdict(list)
    nGene = 0
    curGene = None
    with open(gff, 'r') as fIN:
        for line in fIN:
            # Stop before getting into FASTA zone
            if line.startswith(("##FASTA", "##Fasta")):
                break
            # skip empty lines and lines starting with '#'
            if line.startswith('#') or not line.strip():
                continue
            tmp_line = line.strip().split("\t")
            feature = tmp_line[2]
            if feature == "gene":
                # partition() tolerates empty attributes (trailing ';') and
                # values that themselves contain '='
                geneID = None
                for attr in tmp_line[8].split(';'):
                    attrID, _, attrVal = attr.partition('=')
                    if attrID.strip() == "ID":
                        geneID = attrVal.strip()
                        break
                if geneID is None:
                    # never fall back to the ID of the previous gene
                    agGFFProgress.logger.error(
                        "Gene record without an ID attribute: %s" %
                        (line.strip()))
                    agGFFProgress.logger.error("Please check your GFF file")
                    sys.exit(1)
                curGene = AGOUTI_GFF()
                curGene.setGene(geneID, int(tmp_line[3]), int(tmp_line[4]))
                curGene.setProgram(tmp_line[1])
                curGene.setContigID(tmp_line[0])
                curGene.setStrand(tmp_line[6])
                # Register the gene right away: the list holds a reference,
                # so later codon/CDS updates are still visible, and the last
                # gene is kept even when a FASTA section ends the parsing.
                dGFFs[curGene.ctgID].append(curGene)
                nGene += 1
            elif curGene is None:
                # sub-feature seen before any gene: nothing to attach it to
                continue
            elif feature == "stop_codon":
                curGene.setStopCodon()
            elif feature == "start_codon":
                curGene.setStartCodon()
            elif feature == "CDS":
                curGene.updateCDS(int(tmp_line[3]), int(tmp_line[4]))

    if nGene == 0:
        agGFFProgress.logger.error("Found zero genes")
        agGFFProgress.logger.error("Please check your GFF file")
        sys.exit(1)

    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_gff.debug" % (prefix))
        agGFFDebug = agLOG.DEBUG(moduleName, debugLogFile)
        agGFFDebug.debugger.debug("Sequence\tNum_Gene_Models")

    nGeneModels = 0
    for ctgID in sorted(dGFFs):
        # make sure gene models are in ascending order; sorted() is stable,
        # so genes with identical coordinates keep their file order
        models = sorted(dGFFs[ctgID],
                        key=lambda gene: (gene.geneStart, gene.geneStop))
        dGFFs[ctgID] = models
        nGeneModels += len(models)
        if debug:
            agGFFDebug.debugger.debug("%s\t%d" % (ctgID, len(models)))

    agGFFProgress.logger.info("%d Gene Models parsed" % (nGeneModels))
    agGFFProgress.logger.info("[DONE]")
    return dGFFs