Esempio n. 1
0
def agouti_path_main(agoutiPaths, dSenses, vertex2Name, dGFFs,
                     dCtgPair2GenePair, oriScafPathFile, outDir, prefix):
    """Compare AGOUTI paths with the original scaffolding and recover it.

    Reads the original scaffold paths from oriScafPathFile, reports their
    consistency with agoutiPaths via report_consistency(), then hands
    everything to recover_untouched_sequences().

    Args:
        agoutiPaths: scaffolding paths produced by AGOUTI.
        dSenses: orientation evidence, forwarded to
            recover_untouched_sequences().
        vertex2Name: vertex id -> contig name lookup.
        dGFFs: gene models, forwarded to recover_untouched_sequences().
        dCtgPair2GenePair: contig pair -> gene pair mapping, forwarded to
            recover_untouched_sequences().
        oriScafPathFile: file describing the original scaffold paths.
        outDir: base output directory; an "agouti_path" sub-directory is
            created inside it for the debug log.
        prefix: prefix used for output file names.

    Returns:
        The (agoutiPaths, dCtgPair2GenePair, dSenses) tuple returned by
        recover_untouched_sequences().
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_path")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    agPathProgress = agLOG.PROGRESS_METER(moduleName)
    agPathProgress.logger.info("Analyzing scaffolding paths")
    outDebugFile = os.path.join(moduleOutDir, prefix) + ".agouti_path.debug"
    # label the debug log with this module's name, as the other AGOUTI
    # modules do, instead of the copy-pasted "SHREDDER"
    agPathDebug = agLOG.DEBUG(moduleName, outDebugFile)
    agPathProgress.logger.info("[BEGIN] Reading file with shred info")
    # the second return value (original gaps) is not needed here
    dOriPaths, _ = read_original_path(oriScafPathFile, agPathProgress)
    agPathProgress.logger.info("[DONE]")

    # shut it off for now; working to improve it
    #agPathProgress.logger.info("[BEGIN] Checking consistency")
    #compare(dOriPaths, agoutiPaths, vertex2Name, outDir, prefix)
    #agPathProgress.logger.info("[DONE]")

    report_consistency(agoutiPaths, dOriPaths, vertex2Name, outDir, prefix)

    agPathProgress.logger.info("[BEGIN] Recovering original scaffolding")
    agoutiPaths, dCtgPair2GenePair, dSenses = recover_untouched_sequences(
        dOriPaths, agoutiPaths, vertex2Name, dGFFs, dCtgPair2GenePair, dSenses,
        agPathProgress, agPathDebug)
    agPathProgress.logger.info("[DONE]")

    return agoutiPaths, dCtgPair2GenePair, dSenses
Esempio n. 2
0
def _tag_sort_key(tagRef):
    """Return a sort key that orders tag refs by their numeric parts."""
    # digit runs compare as integers (so v0.10 sorts after v0.9); the leading
    # 0/1 keeps int and str pieces from ever being compared with each other
    return [(0, int(piece), "") if piece.isdigit() else (1, 0, piece)
            for piece in re.split(r"(\d+)", tagRef) if piece]


def _latest_remote_tag(lsRemoteOutput):
    """Return (tagRef, commitHash) of the newest tag in `git ls-remote` text.

    Returns (None, None) when the output lists no tags.
    """
    dVersions = {}
    for line in lsRemoteOutput.split("\n"):
        fields = line.strip().split("\t")
        if len(fields) < 2 or not fields[1].startswith("refs/tags/"):
            continue
        commitHash, ref = fields[0], fields[1]
        if ref.endswith("^{}"):
            # peeled entry of an annotated tag: this is the commit the tag
            # points at, so it always wins over the tag-object hash
            dVersions[ref[:-3]] = commitHash
        else:
            # lightweight tags have no peeled entry; keep their own hash
            dVersions.setdefault(ref, commitHash)
    if not dVersions:
        return None, None
    latestTag = max(dVersions, key=_tag_sort_key)
    return latestTag, dVersions[latestTag]


def update_local(args):
    """Update the local AGOUTI checkout to the latest tag on `origin`.

    Compares the commit hash of the local HEAD with the commit behind the
    newest remote tag; when they differ, fetches and checks that tag out on
    a new branch named after the tag.

    Args:
        args: parsed command-line arguments (not used by this function).

    Exits the process with status 1 when git is unavailable, the remote has
    no tags, or fetch/checkout fails; with status 0 after a successful
    update. Returns None when the checkout is already the latest.
    """
    version = agLOG.PROGRESS_METER("UPDATE")
    repoDir = os.path.dirname(os.path.realpath(__file__))
    # first check git availability
    try:
        p = sp.Popen(["git", "--version"], stdout=sp.PIPE, stderr=sp.PIPE)
        p.communicate()
        gitMissing = bool(p.returncode)
    except OSError:
        # the git executable could not be started at all
        gitMissing = True
    if gitMissing:
        version.logger.info("Please check your PATH for git")
        version.logger.info("Update unsuccessful")
        sys.exit(1)
    # Then compare local with remote
    version.logger.info("Checking available updates of AGOUTI")
    # a single % is required here: git expands "%%" to a literal percent
    # sign, which would make the reported "hash" the text "%H"
    localVersion = sp.check_output(
        ["git", "log", "-n", "1", "--pretty=format:%H"],
        cwd=repoDir,
        universal_newlines=True).strip()
    remoteRefs = sp.check_output(["git", "ls-remote", "origin"],
                                 cwd=repoDir,
                                 universal_newlines=True)
    latesTag, latestHash = _latest_remote_tag(remoteRefs)
    if latesTag is None:
        version.logger.error("No tags found on remote origin")
        version.logger.info("Update unsuccessful")
        sys.exit(1)
    if latestHash != localVersion:
        gitSteps = (
            ("fetch", ["git", "fetch", "--all"]),
            ("checkout", ["git", "checkout", "-q", latesTag, "-b",
                          latesTag.split("/")[-1]]),
        )
        for stepName, gitCmd in gitSteps:
            p = sp.Popen(gitCmd,
                         stdout=sp.PIPE,
                         stderr=sp.PIPE,
                         cwd=repoDir,
                         universal_newlines=True)
            _, perr = p.communicate()
            if p.returncode:
                version.logger.error("git %s error: %s" % (stepName, perr))
                sys.exit(1)
        version.logger.info("Update successful")
        sys.exit(0)
    version.logger.info("Current version is the LATEST. No need to update")
Esempio n. 3
0
def agouti_shred_main(assemblyFile, gffFile, prefix, minGaps, minCtgLen):
    """Shred an assembly and, when a GFF is given, its annotation too.

    Creates the directory that `prefix` resolves into if it is missing,
    calls shred_assembly() and passes its result on to shred_annotation()
    whenever gffFile is truthy. Returns None.
    """
    shredLog = agLOG.PROGRESS_METER("SHREDDER")
    shredLog.logger.info("[BEGIN] Shredding assembly")

    # prefix may carry a directory component; make sure it exists
    prefixDir = os.path.dirname(os.path.realpath(prefix))
    if not os.path.exists(prefixDir):
        os.makedirs(prefixDir)

    intervalsByHeader = shred_assembly(assemblyFile, shredLog, prefix, minGaps,
                                       minCtgLen)
    if not gffFile:
        return
    shred_annotation(intervalsByHeader, gffFile, prefix, shredLog)
Esempio n. 4
0
def run_scaffolder(args):
    """Run the scaffolding pipeline described by the parsed arguments.

    Logs the run parameters to <outDir>/<prefix>.parameters.txt, then calls
    in order: agSeq.agouti_seq_main, agGFF.get_gene_models,
    agBAM.agouti_sam_main, agDENOISE.denoise_joining_pairs,
    agSCAFF.run_scaffolding, optionally agPATH.agouti_path_main (only when
    args.oriScafPath is set) and finally agUPDATE.agouti_update.

    Args:
        args: parsed command-line namespace; the attributes read are
            bamFile, gff, prefix, outDir, assemblyFile, oriScafPath,
            minSupport, nFills, debug, overwrite, minMQ, minFracOvl,
            maxFracMM and no_update_gff.

    Returns:
        None.
    """
    bamFile = args.bamFile
    gffFile = os.path.realpath(args.gff)
    prefix = args.prefix
    outDir = os.path.realpath(args.outDir)
    if not os.path.exists(outDir):
        os.makedirs(outDir)

    paraLogFile = os.path.join(outDir, "%s.parameters.txt" % (args.prefix))
    para = agLOG.PROGRESS_METER(parse_args.__name__)
    para.add_file_handler(paraLogFile)
    para.logger.info("Assembly: %s" % (os.path.realpath(args.assemblyFile)))
    para.logger.info("Gene Model: %s" % (gffFile))
    if args.oriScafPath:
        para.logger.info("Original scaffold path: %s" % (args.oriScafPath))
    para.logger.info("Output directory: %s" % (outDir))
    para.logger.info("Output prefix: %s" % (prefix))
    para.logger.info("Minimum number of supports: %d" % (args.minSupport))
    para.logger.info("Length of gaps to fill between contigs: %d" %
                     (args.nFills))

    vertex2Name, dSeqs = agSeq.agouti_seq_main(args.assemblyFile, outDir,
                                               prefix, args.debug)

    dGFFs = agGFF.get_gene_models(gffFile, outDir, prefix, args.debug)

    dContigPairs = agBAM.agouti_sam_main(bamFile, outDir, prefix,
                                         args.overwrite, args.minMQ,
                                         args.minFracOvl, args.maxFracMM,
                                         args.debug)

    dCtgPair2GenePair, dCtgPairDenoise = agDENOISE.denoise_joining_pairs(
        dContigPairs, dGFFs, vertex2Name, outDir, prefix, args.minSupport,
        args.debug)

    agoutiPaths, dSenses = agSCAFF.run_scaffolding(vertex2Name,
                                                   dCtgPairDenoise,
                                                   dCtgPair2GenePair, outDir,
                                                   prefix, args.minSupport,
                                                   args.debug)

    if args.oriScafPath:
        agoutiPaths, dCtgPair2GenePair, dSenses = agPATH.agouti_path_main(
            agoutiPaths, dSenses, vertex2Name, dGFFs, dCtgPair2GenePair,
            args.oriScafPath, outDir, prefix)

    agUPDATE.agouti_update(agoutiPaths, dSeqs, vertex2Name, dSenses, dGFFs,
                           dCtgPair2GenePair, outDir, prefix, args.nFills,
                           args.debug, args.no_update_gff)

    # divide by a float: under Python 2 int / int truncates, which made the
    # "%.5f" figure a whole number (0.00000 for anything below 1 GB)
    # NOTE(review): this assumes ru_maxrss is reported in kilobytes, as on
    # Linux; other platforms may use a different unit - confirm
    peakMemGB = resource.getrusage(
        resource.RUSAGE_SELF).ru_maxrss / (1024.0 * 1024)
    para.logger.info("Peak memory use: %.5f GB" % (peakMemGB))
Esempio n. 5
0
def agouti_seq_main(assemblyFile, outDir, prefix, debug=0):
    """Load the initial assembly and number its sequences from 1.

    Args:
        assemblyFile: FASTA file handed to read_fasta().
        outDir: base output directory; an "agouti_seq" sub-directory is
            created inside it for the progress log.
        prefix: prefix of the progress log file name.
        debug: accepted for interface compatibility; not used here.

    Returns:
        (contigs, dSeqs): contigs is a list of headers with the placeholder
        "NONE" at index 0, dSeqs maps each 1-based index to its sequence,
        so contigs[i] is the header of dSeqs[i].

    Exits with status 1 when the same header occurs twice.
    """
    loggerName = os.path.basename(__file__).split('.')[0].upper()
    seqOutDir = os.path.join(outDir, "agouti_seq")
    if not os.path.exists(seqOutDir):
        os.makedirs(seqOutDir)
    seqLog = agLOG.PROGRESS_METER(loggerName)
    seqLog.add_file_handler(
        os.path.join(seqOutDir, "%s.agouti_seq.progressMeter" % (prefix)))
    seqLog.logger.info("[BEGIN] Reading the initial assembly")

    # index 0 holds a placeholder so that real contigs start at index 1
    contigs = ["NONE"]
    dSeqs = {}
    seenHeaders = set()
    lengths = []
    for header, seq in read_fasta(assemblyFile):
        # the full header is the identifier; duplicates are fatal
        if header in seenHeaders:
            seqLog.logger.error("AGOUTI found DUPLICATED header: %s" %
                                (header))
            seqLog.logger.error("QUIT")
            sys.exit(1)
        seenHeaders.add(header)
        contigs.append(header)
        # the position just appended to is this sequence's index
        dSeqs[len(contigs) - 1] = seq
        lengths.append(len(seq))

    n50 = get_assembly_NXX(lengths)

    seqLog.logger.info("%d sequences parsed" % (len(dSeqs)))
    seqLog.logger.info("The given assembly N50: %d" % (n50))
    seqLog.logger.info("[DONE]")
    return contigs, dSeqs
Esempio n. 6
0
def agouti_update(agoutiPaths,
                  dSeqs,
                  vertex2Name,
                  dSenses,
                  dGFFs,
                  dCtgPair2GenePair,
                  outDir,
                  prefix,
                  nFills=1000,
                  debug=0,
                  no_update_gff=0):
    """Build scaffold sequences from AGOUTI paths and write the assembly.

    For every path in agoutiPaths the contigs are concatenated, separated
    by nFills 'N' characters, with each join oriented according to
    decide_orientation() applied to the counts from
    get_orientation_counts(). The scaffolds are written to
    <outDir>/<prefix>.agouti.fasta, followed by every sequence of dSeqs
    that was not placed in a scaffold. Unless no_update_gff is truthy, gene
    models are merged and shifted along the way and handed to output_gff()
    and summarize_gene_path().

    Args:
        agoutiPaths: sequence of paths; each path is a sequence of vertex
            ids used as keys into dSeqs and vertex2Name.
        dSeqs: mapping vertex id -> sequence string.
        vertex2Name: lookup from vertex id to contig name.
        dSenses: passed unchanged to get_orientation_counts().
        dGFFs: mapping contig name -> gene models. Modified in place:
            entries are replaced by reversed models and entries of
            scaffolded contigs are deleted.
        dCtgPair2GenePair: passed unchanged to ctgpair2genepair().
        outDir: output directory; an "agouti_update" sub-directory is
            created inside it for the log files.
        prefix: prefix of output file names and of scaffold names.
        nFills: number of 'N' characters inserted between joined contigs.
        debug: when truthy, debug records are written to the debug log.
        no_update_gff: when truthy, all gene-model updating and the GFF
            output are skipped.

    Returns:
        None.

    Side effects:
        Sets the module globals agUPDATEProgress and, when debug is truthy,
        agUPDATEDebug. Calls sys.exit(1) when ctgpair2genepair() yields
        None for both genes of a joined contig pair.
    """

    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_update")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_update.progressMeter" % (prefix))
    global agUPDATEProgress
    agUPDATEProgress = agLOG.PROGRESS_METER(moduleName)
    agUPDATEProgress.add_file_handler(progressLogFile)
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_update.debug" % (prefix))
        global agUPDATEDebug
        agUPDATEDebug = agLOG.DEBUG(moduleName, debugLogFile)

    if not no_update_gff:
        agUPDATEProgress.logger.info("[BEGIN] Updating gene models")

    outFasta = os.path.join(outDir, "%s.agouti.fasta" % (prefix))
    # closed explicitly after the unscaffolded contigs are written; the
    # early sys.exit(1) below skips that close()
    fFASTA = open(outFasta, 'w')
    dUpdateGFFs = collections.defaultdict(list)
    dMergedGene2Ctgs = collections.defaultdict(list)
    dMergedGene2Genes = collections.defaultdict(list)
    scafPaths = []
    numMergedGene = 0
    nCtgScaffolded = 0
    scaffoldedCtgs = {}
    seqLens = []
    dScafGaps = {}
    dScafStats = {}
    scafID = 0
    mergedGenes = []
    for i in range(len(agoutiPaths)):
        path = agoutiPaths[i]
        scafID += 1
        scafName = prefix + "_scaf_%d" % (scafID)
        dScafStats[scafName] = 0
        dScafGaps[scafName] = []

        # per-scaffold state; the scaffold starts as the first contig as-is
        curVertex = path[0]
        sequence = dSeqs[curVertex]
        curSense = "+"
        curCtg = vertex2Name[curVertex]
        preCtg = ""
        scafPath = [curVertex]
        preGeneID, curGeneID = "", ""
        mergedGene = agGFF.AGOUTI_GFF()
        preMergedGene = agGFF.AGOUTI_GFF()
        gapStart, gapStop = 0, 0
        offset = 0
        orientation = ""
        updatedGeneIDs = []
        mergedGenesPerPath = []
        excludeGeneIDs = []
        for nextVertex in path[1:]:
            nextCtg = vertex2Name[nextVertex]

            # preCtg is still empty only on the first join of a path
            if preCtg == "":
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t>scaf_%d - path - %s" %
                        (scafID, str([vertex2Name[vertex]
                                      for vertex in path])))
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tcurVertex - %d - %s - nextVertex - %d - %s"
                    % (curVertex, curCtg, nextVertex, nextCtg))

            if not no_update_gff:
                #curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair, curCtg, nextCtg)
                curGene, nextGene = ctgpair2genepair(dCtgPair2GenePair,
                                                     curVertex, nextVertex)
                #!!! I should not break here, should continue#
                if curGene is None and nextGene is None:
                    agUPDATEProgress.logger.error(
                        "%s - %s found no gene models joining them" %
                        (curCtg, nextCtg))
                    agUPDATEProgress.logger.error(
                        "This is NOT EXPECTED, REPORT!")
                    sys.exit(1)
                curGeneID = curGene.geneID
                excludeGeneIDs = [preGeneID] + [curGeneID]
                if debug:
                    agUPDATEDebug.debugger.debug(
                        "UPDATE_MAIN\t\tpreGene - %s - curGene - %s - nextGene - %s"
                        % (preGeneID, curGene.geneID, nextGene.geneID))

            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafName - %s" %
                                             (scafName))
            FR, FF, RR, RF = get_orientation_counts(curVertex, nextVertex,
                                                    dSenses)
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" %
                                             (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" %
                    (FR, FF, RF, RR))
            # the current contig was appended reverse-complemented on the
            # previous join, so swap the counts (FR<->RR, FF<->RF)
            if curSense == "-":
                temp1 = FR
                temp2 = FF
                FR = RR
                FF = RF
                RR = temp1
                RF = temp2
            orientation = decide_orientation(FR, FF, RR, RF)

            # gap of nFills positions placed after the current contig; it is
            # stored below shifted by +1
            # NOTE(review): from the second join on, gapStop carries the
            # 0-based index of the previous gap's last N, so gapStart appears
            # to be one less than the 0-based start of the new gap - confirm
            gapStart = gapStop + len(dSeqs[curVertex])
            gapStop = gapStart + nFills - 1
            dScafGaps[scafName].append((gapStart + 1, gapStop + 1))
            if debug:
                agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tcurSense=%s" %
                                             (curSense))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tFR=%d - FF=%d - RF=%d - RR=%d" %
                    (FR, FF, RF, RR))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\toffset - %d - curCtgLen - %d" %
                    (offset, len(dSeqs[curVertex])))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tgapstart - %d - gapstop - %d" %
                    (gapStart, gapStop + 1))
            # never read anywhere in this function
            valid = 0
            # FR: append the next contig as-is
            if orientation == "FR":
                if not no_update_gff:
                    # a new merged gene starts unless the current gene is
                    # the one that ended the previous join
                    if curGeneID != preGeneID:
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        #if curGene.geneStop != 0:
                        #	dMergedGene2Genes[mergedGene.geneID] += [curGeneID]
                        #if nextGene.geneStop != 0:
                        #	dMergedGene2Genes[mergedGene.geneID] += [nextGene.geneID]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[
                            scafName], updatedGeneIDs = update_gene_model(
                                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                                offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        # same gene continues: extend the previous merged
                        # gene and replace it in the scaffold's model list
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        if nextGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                nextGene.geneID
                            ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + dSeqs[nextVertex]
                scafPath += [nextVertex]
                curSense = "+"
            # FF: append the next contig reverse-complemented
            elif orientation == "FF":
                if not no_update_gff:
                    #nextGene = reverse_gene_model(nextGene, len(dSeqs[nextVertex]), debug)
                    dGFFs[nextCtg] = reverse_gene_models(
                        dGFFs[nextCtg], len(dSeqs[nextVertex]), debug)
                    if curGeneID != preGeneID:
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[
                            scafName], updatedGeneIDs = update_gene_model(
                                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                                offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [
                            nextGene.geneID
                        ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence += 'N' * nFills + agSeq.rc_seq(dSeqs[nextVertex])
                # a negative vertex id marks a reverse-complemented contig
                scafPath += [-1 * nextVertex]
                curSense = "-"
            # RR: reverse-complement the scaffold built so far, then append
            # the next contig as-is
            elif orientation == "RR":
                if not no_update_gff:
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(
                            dGFFs[curCtg], len(dSeqs[curVertex]), debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[
                            scafName], updatedGeneIDs = update_gene_model(
                                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                                offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[
                            scafName], updatedGeneIDs = update_gene_model(
                                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                                offset, excludeGeneIDs, debug)
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName], gapStart - 1, debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [
                            nextGene.geneID
                        ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                     'N'*nFills + dSeqs[nextVertex]
                # NOTE(review): only the last path entry changes sign although
                # the whole accumulated sequence was reverse-complemented -
                # confirm this is intended for paths longer than two contigs
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [nextVertex]
                curSense = "+"
            # RF: reverse-complement the scaffold built so far and append
            # the next contig reverse-complemented
            elif orientation == "RF":
                if not no_update_gff:
                    dGFFs[nextCtg] = reverse_gene_models(
                        dGFFs[nextCtg], len(dSeqs[nextVertex]), debug)
                    if curGene.geneID != preGeneID:
                        dGFFs[curCtg] = reverse_gene_models(
                            dGFFs[curCtg], len(dSeqs[curVertex]), debug)
                        #nextGene = reverse_gene_model(nextGene, len(dSeqs[nextVertex]), debug)
                        numMergedGene += 1
                        mergedGene = merge_gene_model(curGene, nextGene,
                                                      scafName, numMergedGene,
                                                      offset, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [
                            curCtg, nextCtg
                        ]
                        if mergedGene.geneStop != 0:
                            dMergedGene2Genes[mergedGene.geneID] += [
                                curGeneID, nextGene.geneID
                            ]
                        dUpdateGFFs[
                            scafName], updatedGeneIDs = update_gene_model(
                                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                                offset, excludeGeneIDs, debug, mergedGene)
                    else:
                        dUpdateGFFs[
                            scafName], updatedGeneIDs = update_gene_model(
                                dGFFs[curCtg], dUpdateGFFs[scafName], scafName,
                                offset, excludeGeneIDs, debug)
                        dUpdateGFFs[scafName] = reverse_gene_models(
                            dUpdateGFFs[scafName],
                            gapStop + len(dSeqs[curVertex]), debug)
                        mergedGene = merge_gene_model(preMergedGene, nextGene,
                                                      scafName, numMergedGene,
                                                      0, gapStop, debug)
                        dMergedGene2Ctgs[mergedGene.geneID] += [nextCtg]
                        dMergedGene2Genes[mergedGene.geneID] += [
                            nextGene.geneID
                        ]
                        indexMerged = updatedGeneIDs.index(mergedGene.geneID)
                        dUpdateGFFs[scafName][indexMerged] = mergedGene
                    preMergedGene = mergedGene
                sequence = agSeq.rc_seq(sequence) + \
                     'N'*nFills + \
                     agSeq.rc_seq(dSeqs[nextVertex])
                scafPath[-1] = -1 * scafPath[-1]
                scafPath += [-1 * nextVertex]
                curSense = "-"
            # any other orientation value leaves sequence and scafPath
            # untouched for this join
            if debug:
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tscafPath in vertices updates- %s" %
                    (str(scafPath)))
                agUPDATEDebug.debugger.debug(
                    "UPDATE_MAIN\t\tdMergedGene2Gene - %s" %
                    (str(dMergedGene2Genes[mergedGene.geneID])))
            if not no_update_gff:
                mergedGenesPerPath.append(mergedGene.geneID)
                preGeneID = nextGene.geneID
            # advance: the next contig becomes the current one
            offset = gapStop
            preCtg = curCtg
            curVertex = nextVertex
            curCtg = vertex2Name[curVertex]

        # translate vertex ids into contig names; reversed contigs get a "-"
        # prefix. Reusing i is harmless: the outer loop rebinds it.
        for i in range(len(scafPath)):
            v = scafPath[i]
            if v < 0:
                scafPath[i] = "-" + vertex2Name[-1 * v]
            else:
                scafPath[i] = vertex2Name[v]
        scafPaths += [scafPath]
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tscafPath in human-readable updates- %s" %
                (str(scafPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tappend last curCtg - %s" % (curCtg))
            agUPDATEDebug.debugger.debug("UPDATE_MAIN\t\tscafPath - %s" %
                                         (str(scafPath)))
        if not no_update_gff:
            # add the gene models of the last contig in the path
            excludeGeneIDs = [preGeneID]
            mergedGenes.append(mergedGenesPerPath)
            dUpdateGFFs[scafName], updatedGeneIDs = update_gene_model(
                dGFFs[curCtg], dUpdateGFFs[scafName], scafName, offset,
                excludeGeneIDs, debug)
        fFASTA.write(">%s |%dbp |%s\n%s\n" %
                     (scafName, len(sequence), ",".join(scafPath), sequence))
        dScafStats[scafName] = len(sequence)
        seqLens.append(len(sequence))
        #agPaths.append(scafPath)
        nCtgScaffolded += len(scafPath)
        # keys are the names as written in scafPath, i.e. including any "-"
        scaffoldedCtgs.update(dict((contig, 1) for contig in scafPath))
        if debug:
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t\tmergedGenesPerPath - %s" %
                (str(mergedGenesPerPath)))
            agUPDATEDebug.debugger.debug(
                "UPDATE_MAIN\t-------------------------------------")

    agPATH.report_scaffold_path(scafPaths, vertex2Name, outDir, prefix)

    # other contigs need to be output
    agUPDATEProgress.logger.info("Finalizing sequences")
    for vertex in dSeqs:
        # skip contigs already written as part of a scaffold, in either
        # orientation
        if vertex2Name[vertex] in scaffoldedCtgs or "-" + vertex2Name[
                vertex] in scaffoldedCtgs:
            continue
        fFASTA.write(">%s\n%s\n" % (vertex2Name[vertex], dSeqs[vertex]))
        dScafStats[vertex2Name[vertex]] = len(dSeqs[vertex])
        seqLens.append(len(dSeqs[vertex]))
    fFASTA.close()
    n50 = agSeq.get_assembly_NXX(seqLens)

    agUPDATEProgress.logger.info("Outputting updated Gene Moddels")
    # NOTE(review): unlike the loop above, this does not test the "-"
    # prefixed name, so models of contigs recorded as reversed stay in
    # dGFFs - confirm this is intended
    for vertex in dSeqs:
        if vertex2Name[vertex] in scaffoldedCtgs:
            if vertex2Name[vertex] in dGFFs:
                del dGFFs[vertex2Name[vertex]]
    if not no_update_gff:
        # merge per-contig and per-scaffold models; on a key clash the
        # scaffold entry from dUpdateGFFs wins
        dFinalGFFs = dict(dGFFs, **dUpdateGFFs)
        numGenes = output_gff(dFinalGFFs, dMergedGene2Ctgs, dMergedGene2Genes,
                              dScafStats, dScafGaps, outDir, prefix)
        agUPDATEProgress.logger.info("Summarizing AGOUTI gene paths")
        summarize_gene_path(dMergedGene2Genes, dMergedGene2Ctgs, outDir,
                            prefix)

    agUPDATEProgress.logger.info("-----------Summary-----------")
    agUPDATEProgress.logger.info("number of contigs scaffoled: %d" %
                                 (nCtgScaffolded))
    agUPDATEProgress.logger.info("number of scaffolds: %d" % (scafID))
    agUPDATEProgress.logger.info(
        "number of contigs in the final assembly: %d" % (len(seqLens)))
    agUPDATEProgress.logger.info("Final assembly N50: %d" % (n50))
    if not no_update_gff:
        agUPDATEProgress.logger.info("Final number of genes: %d" % (numGenes))
    agUPDATEProgress.logger.info("Succeeded")
Esempio n. 7
0
def _resolve_gene_end(dGFFs, ctg, geneModels, geneIndex, end, intervals,
                      defaultEnd, debug):
    """Settle which gene model on ``ctg`` anchors a join, and at which end.

    ``geneIndex`` and ``end`` are the values reported for this contig by
    ``get_genePair_for_contigPair``.  A ``geneIndex`` of -1 (with ``end`` 5
    or 0) or 1 (with ``end`` 3 or 0) makes this function ask
    ``create_fake_genes`` for a new model list, inserted at position 0 or
    ``len(geneModels)`` respectively, and store it in ``dGFFs[ctg]``.
    NOTE(review): -1/1 presumably mean "reads fall before the first / after
    the last annotated gene" -- confirm against get_genePair_for_contigPair.

    For ``geneIndex == 0`` an undecided end (0) becomes ``defaultEnd`` and a
    3' end re-points the index to the last model on the contig.

    Returns the resolved ``(geneIndex, end)``; any other combination is
    returned unchanged.
    """
    if geneIndex != 0:
        if geneIndex == -1 and (end == 5 or end == 0):
            dGFFs[ctg] = create_fake_genes(geneModels, 0, ctg, intervals,
                                           debug)
            geneIndex = 0
            end = 5
        elif geneIndex == 1 and (end == 3 or end == 0):
            dGFFs[ctg] = create_fake_genes(geneModels, len(geneModels), ctg,
                                           intervals, debug)
            geneIndex = len(dGFFs[ctg]) - 1
            end = 3
    elif end == 0:
        end = defaultEnd
    elif end == 3:
        geneIndex = len(dGFFs[ctg]) - 1
    return geneIndex, end


def denoise_joining_pairs(dContigPairs,
                          dGFFs,
                          vertex2Name,
                          outDir,
                          prefix,
                          minSupport,
                          debug=0):
    """Keep only the contig pairs whose joining reads agree with gene models.

    Parameters:
        dContigPairs: dict keyed by ``(ctgA, ctgB)``; each value is a list of
            ``(startA, startB, stopA, stopB, senseA, senseB, readID)``
            records.  Entries with fewer than ``minSupport`` records are
            deleted from this dict in place.
        dGFFs: mapping contig -> list of gene models (objects exposing
            ``geneID``, ``geneStart`` and ``geneStop``).  Lists may be
            replaced with the output of ``create_fake_genes``.
        vertex2Name: sequence of contig names; ``.index()`` of a name is the
            vertex id used in the returned dicts.
        outDir, prefix: output goes to
            ``<outDir>/agouti_denoise/<prefix>.*``.
        minSupport: minimum number of joining pairs a contig pair needs.
        debug: truthy to also write a debug log.

    Returns:
        ``(dCtgPair2GenePair, dCtgPairDenoise)``, both keyed by
        ``(vertexA, vertexB)``.  The first maps to ``[geneA, geneB]``, the
        second to ``[weight, (senseA, senseB)]`` where ``weight`` is the
        number of records written to the noise-free file for that pair.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_denoise")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_denoise.progressMeter" % (prefix))
    agDENOISEProgress = agLOG.PROGRESS_METER(moduleName)
    agDENOISEProgress.add_file_handler(progressLogFile)

    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_denoise.debug" % (prefix))
        # Published at module level; presumably the helpers that receive
        # `debug` log through this same object -- verify before localising.
        global agDENOISEDebug
        agDENOISEDebug = agLOG.DEBUG(moduleName, debugLogFile)

    agDENOISEProgress.logger.info("[BEGIN] Denoising joining pairs")
    # time.clock no longer exists on Python 3.8+; fall back to it only on
    # interpreters that lack process_time.
    cpuClock = getattr(time, "process_time", None) or time.clock
    startTime = cpuClock()
    dCtgPair2GenePair = collections.defaultdict()
    dCtgPairDenoise = collections.defaultdict()
    nFail4Combination = 0
    nFailGeneModel = 0
    nFailK = 0
    outDenoiseJPFile = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.noise_free.txt" % (prefix))
    rowFormat = "%s\t%s\t%s\t%s\t%s\t%s\t%s\n"
    # `with` guarantees the output file is closed even if a helper raises.
    with open(outDenoiseJPFile, 'w') as fOUT:
        # Iterate over a snapshot: weakly supported pairs are deleted from
        # dContigPairs inside the loop.
        for ctgPair, pairInfo in list(dContigPairs.items()):
            if len(pairInfo) < minSupport:
                nFailK += 1
                del dContigPairs[ctgPair]
                continue
            ctgA = ctgPair[0]
            ctgB = ctgPair[1]
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t>contigA - %s - contigB - %s" %
                    (ctgA, ctgB))
            mapIntervalsA = []
            mapIntervalsB = []
            senses = []
            for record in pairInfo:
                startA, startB, stopA, stopB, senseA, senseB, _ = record
                mapIntervalsA.append((startA, stopA))
                mapIntervalsB.append((startB, stopB))
                senses.append((senseA, senseB))
            genePair = get_genePair_for_contigPair(dGFFs, ctgA, ctgB,
                                                   mapIntervalsA,
                                                   mapIntervalsB, senses,
                                                   debug)
            # Captured before any fake gene is inserted below.
            geneModelsA = dGFFs[ctgA]
            geneModelsB = dGFFs[ctgB]
            if genePair is None:
                nFailGeneModel += 1
                if debug:
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\tFail to find a pair of gene models")
                    agDENOISEDebug.debugger.debug(
                        "DENOISE_MAIN\t----------------------------------")
                continue

            (geneIndexA, geneIndexB, endA, endB, intervalsA, intervalsB,
             senses) = genePair
            sensesCounter = collections.Counter(senses)
            if debug:
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tsensesCounter: %s" % (str(sensesCounter)))
            # An undecided end defaults to 5' on contig B and 3' on contig A.
            geneIndexB, endB = _resolve_gene_end(dGFFs, ctgB, geneModelsB,
                                                 geneIndexB, endB,
                                                 intervalsB, 5, debug)
            geneIndexA, endA = _resolve_gene_end(dGFFs, ctgA, geneModelsA,
                                                 geneIndexA, endA,
                                                 intervalsA, 3, debug)
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tgenePair: %s" %
                                              (str(genePair)))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t# models on ctgA - %d - # models on ctgB - %d"
                    % (len(dGFFs[ctgA]), len(dGFFs[ctgB])))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneIndexA - %d - endA - %d - geneIndexB - %d - endB - %d"
                    % (geneIndexA, endA, geneIndexB, endB))
            # Most frequent (senseA, senseB) combination among the reads.
            sense = sorted(sensesCounter.items(),
                           key=operator.itemgetter(1),
                           reverse=True)[0][0]
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tsensePair - %s" %
                                              (str(sense)))

            lastOn3A = geneIndexA == len(dGFFs[ctgA]) - 1 and endA == 3
            firstOn5A = geneIndexA == 0 and endA == 5
            lastOn3B = geneIndexB == len(dGFFs[ctgB]) - 1 and endB == 3
            firstOn5B = geneIndexB == 0 and endB == 5
            # Only one gene on the contig: it doesn't matter which end.
            singleGene = (geneIndexA == 0 and (endA == 0 or endA == 3)
                          and geneIndexB == 0 and (endB == 0 or endB == 5))
            keep = (
                (lastOn3A and firstOn5B and sense == ('+', '-'))  # FR 3'-5'
                or (firstOn5A and firstOn5B and sense == ('-', '-'))  # RR 5'-5'
                or (lastOn3A and lastOn3B and sense == ('+', '+'))  # FF 3'-3'
                or (firstOn5A and lastOn3B and sense == ('-', '+'))  # RF 5'-3'
                or (singleGene and sense == ('+', '-')))
            if not keep:
                nFail4Combination += 1
                continue

            geneA = dGFFs[ctgA][geneIndexA]
            geneB = dGFFs[ctgB][geneIndexB]
            # list.index is a linear scan; look each vertex up only once.
            vertexA = vertex2Name.index(ctgA)
            vertexB = vertex2Name.index(ctgB)
            dCtgPair2GenePair[vertexA, vertexB] = [geneA, geneB]
            if debug:
                agDENOISEDebug.debugger.debug("DENOISE_MAIN\tNOISE-FREE")
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneA ID - %s - startA - %d - stopA = %d"
                    % (geneA.geneID, geneA.geneStart, geneA.geneStop))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\tgeneB ID - %s - startB - %d - stopB = %d"
                    % (geneB.geneID, geneB.geneStart, geneB.geneStop))
                agDENOISEDebug.debugger.debug(
                    "DENOISE_MAIN\t----------------------------------")
            senseA = sense[0]
            senseB = sense[1]
            spanA = (geneA.geneStart, geneA.geneStop)
            spanB = (geneB.geneStart, geneB.geneStop)
            # With an empty interval list every read on that side is used;
            # otherwise a read is used only when find_overlap() against the
            # chosen gene returns 0.
            # NOTE(review): the meaning of a 0 return lives in find_overlap.
            filterA = len(intervalsA) != 0
            filterB = len(intervalsB) != 0
            weight = 0
            for record in pairInfo:
                startA, startB, stopA, stopB, _, _, readID = record
                passA = (not filterA
                         or find_overlap((startA, stopA), spanA) == 0)
                passB = (not filterB
                         or find_overlap((startB, stopB), spanB) == 0)
                if passA and passB:
                    fOUT.write(rowFormat % (readID, ctgA, startA, senseA,
                                            ctgB, startB, senseB))
                    weight += 1
            dCtgPairDenoise[vertexA, vertexB] = [weight, (senseA, senseB)]

    agDENOISEProgress.logger.info("Succeeded")
    agDENOISEProgress.logger.info("Denoise took in %.2f min CPU time" %
                                  ((cpuClock() - startTime) / 60))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for spanning across >1 gene models" %
        (nFailGeneModel))
    agDENOISEProgress.logger.info(
        "%d contig pairs filtered for not being one of the four combinations" %
        (nFail4Combination))
    agDENOISEProgress.logger.info("%d contig pairs filtered for less support" %
                                  (nFailK))
    agDENOISEProgress.logger.info("%d contig pairs for scaffolding" %
                                  (len(dCtgPairDenoise)))
    return dCtgPair2GenePair, dCtgPairDenoise
Esempio n. 8
0
def get_joining_pairs(bamStream,
                      outDir,
                      prefix,
                      overwrite,
                      minMapQ=5,
                      minFracOvl=0.0,
                      maxFracMismatch=1.0,
                      debug=0):
    """Extract joining pairs (mates aligned to different contigs).

    ``bamStream`` is read two lines at a time; each line is parsed as a
    tab-separated SAM alignment record (QNAME, FLAG, RNAME, POS, MAPQ, CIGAR,
    ..., SEQ, ..., optional tags from column 12 on).  Reading stops at the
    first line that yields a single field (end of stream).

    A pair is kept when both lines carry the same read name, both mates are
    mapped, the contigs differ, and the pair passes the ``minFracOvl``,
    ``maxFracMismatch`` and ``minMapQ`` thresholds.  Kept pairs are written
    to ``<outDir>/agouti_join_pairs/<prefix>.agouti.join_pairs.all.txt``.

    If a progress log from a previous run exists and ``overwrite`` is falsy,
    the earlier result is loaded through ``retrieve_joininng_pairs`` and
    returned; the stream is only parsed when that returns ``None``.

    Returns:
        ``defaultdict(list)`` keyed by ``(ctgA, ctgB)`` with
        ``ctgA <= ctgB``; each value is a list of
        ``(startA, startB, stopA, stopB, senseA, senseB, readsID)`` records
        with 1-based inclusive coordinates.

    Exits the process with status 1 when no joining pair is found.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    # Checked before any handler is attached to the log file.
    hasPreviousRun = os.path.exists(progressLogFile)
    if hasPreviousRun and not overwrite:
        agBAMProgress.add_file_handler(progressLogFile, 'a')
        dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                               agBAMOutAllJoinPairs)
        if dContigPairs is not None:
            return dContigPairs
        agBAMProgress.logger.info(
            "Fail to pick up results from the previous run")
        agBAMProgress.logger.info("Re-processing the BAM file")
    else:
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
        if hasPreviousRun:
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")

    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)

    rowFormat = "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
    with open(agBAMOutAllJoinPairs, 'w') as fOUT:
        agBAMProgress.logger.info(
            "# processed\t| Current Reads ID\t| Elapsed Time")
        if debug:
            agBAMDebug.debugger.debug(
                "Reads_ID\tLocationA\tLocationB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
            )
        startTime = time.time()
        dContigPairs = collections.defaultdict(list)
        nJoinPairs = 0
        nReadsPairs = 0
        while True:
            pairA = bamStream.readline().strip().split("\t")
            pairB = bamStream.readline().strip().split("\t")
            # reach the end of the file
            if len(pairA) == 1 or len(pairB) == 1:
                break
            readsID = pairA[0]
            contigA = pairA[2]
            contigB = pairB[2]
            nReadsPairs += 1
            # RNAME "*" marks an unmapped mate; such a pair can never join
            # two contigs, so it is skipped like in agouti_sam_main.
            if (readsID == pairB[0] and contigA != contigB
                    and contigA != "*" and contigB != "*"):
                alnLenA = getCIGAR(pairA[5])
                alnLenB = getCIGAR(pairB[5])
                leftMostPosA = int(pairA[3])  # 1-based in SAM
                leftMostPosB = int(pairB[3])
                readLenA = len(pairA[9])
                readLenB = len(pairB[9])
                nMismatchesA = getMismatches(pairA[11:])
                nMismatchesB = getMismatches(pairB[11:])
                mapQA = int(pairA[4])
                mapQB = int(pairB[4])
                senseA = explainSAMFlag(int(pairA[1]))[4]
                senseB = explainSAMFlag(int(pairB[1]))[4]
                if debug:
                    agBAMDebug.debugger.debug(
                        "%s\t%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d" %
                        (readsID, contigA + ":" + str(leftMostPosA),
                         contigB + ":" + str(leftMostPosB), mapQA, mapQB,
                         senseA, senseB, readLenA, readLenB))

                # float() keeps these true ratios even if the helpers hand
                # back ints on an interpreter with floor division.
                fracOvlA = float(alnLenA) / readLenA
                fracOvlB = float(alnLenB) / readLenB
                fracMismatchA = float(nMismatchesA) / alnLenA
                fracMismatchB = float(nMismatchesB) / alnLenB
                if (min(fracOvlA, fracOvlB) >= minFracOvl
                        and max(fracMismatchA,
                                fracMismatchB) <= maxFracMismatch
                        and min(mapQA, mapQB) >= minMapQ):
                    # SAM POS is already 1-based, so the inclusive interval
                    # is [POS, POS + alnLen - 1]; no extra +1 shift.
                    startA = leftMostPosA
                    stopA = startA + int(alnLenA) - 1
                    startB = leftMostPosB
                    stopB = startB + int(alnLenB) - 1
                    nJoinPairs += 1
                    # Store every pair under the lexicographically ordered
                    # contig key, swapping the A/B fields when needed.
                    if contigA <= contigB:
                        key = (contigA, contigB)
                        record = (startA, startB, stopA, stopB, senseA,
                                  senseB, readsID)
                        row = (readsID, contigA, startA, stopA, senseA,
                               contigB, startB, stopB, senseB)
                    else:
                        key = (contigB, contigA)
                        record = (startB, startA, stopB, stopA, senseB,
                                  senseA, readsID)
                        row = (readsID, contigB, startB, stopB, senseB,
                               contigA, startA, stopA, senseA)
                    dContigPairs[key].append(record)
                    fOUT.write(rowFormat % row)
            if nReadsPairs % 5000000 == 0:
                elapsedTime = float((time.time() - startTime) / 60)
                agBAMProgress.logger.info("%d parsed\t| %s\t| %.2f m" %
                                          (nReadsPairs, readsID, elapsedTime))

    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs" %
                              (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    agBAMProgress.logger.info("Succeeded")
    return dContigPairs
Esempio n. 9
0
def agouti_sam_main(bamFile,
                    outDir,
                    prefix,
                    overwrite,
                    minMapQ,
                    minFracOvl,
                    maxFracMismatch,
                    debug=0):
    """Run SAMtools on ``bamFile`` and collect the joining pairs.

    Each item yielded by ``run_samtools`` is split on a newline into two
    tab-separated SAM records (one per mate).  A pair is kept when both
    mates are mapped, share a read name, hit different contigs and pass the
    ``minFracOvl``, ``maxFracMismatch`` and ``minMapQ`` thresholds.  Kept
    pairs are written to
    ``<outDir>/agouti_join_pairs/<prefix>.agouti.join_pairs.all.txt``.

    If a progress log from a previous run exists and ``overwrite`` is falsy,
    the earlier result is loaded through ``retrieve_joininng_pairs`` and
    returned; the BAM is only processed when that returns ``None``.

    Returns:
        ``defaultdict(list)`` keyed by ``(ctgA, ctgB)`` with
        ``ctgA <= ctgB``; each value is a list of
        ``(startA, startB, stopA, stopB, senseA, senseB, readsID)`` records
        with 1-based inclusive coordinates.

    Exits the process with status 1 on Ctrl-C or when no joining pair is
    found.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_join_pairs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)

    progressLogFile = os.path.join(
        moduleOutDir, "%s.agouti_join_pairs.progressMeter" % (prefix))
    agBAMOutAllJoinPairs = os.path.join(
        moduleOutDir, "%s.agouti.join_pairs.all.txt" % (prefix))
    agBAMProgress = agLOG.PROGRESS_METER(moduleName)
    # Checked before any handler is attached to the log file.
    hasPreviousRun = os.path.exists(progressLogFile)
    if hasPreviousRun and not overwrite:
        agBAMProgress.add_file_handler(progressLogFile, 'a')
        dContigPairs = retrieve_joininng_pairs(agBAMProgress,
                                               agBAMOutAllJoinPairs)
        if dContigPairs is not None:
            return dContigPairs
        agBAMProgress.logger.info(
            "Fail to pick up results from the previous run")
        agBAMProgress.logger.info("Re-processing the BAM file")
    else:
        agBAMProgress.add_file_handler(progressLogFile)
        agBAMProgress.logger.info("[BEGIN] Identifying joining pairs")
        if hasPreviousRun:
            agBAMProgress.logger.info(
                "Overwrite results from the previous run")

    agBAMDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_join_pairs.debug" % (prefix))
        agBAMDebug = agLOG.DEBUG(moduleName, debugLogFile)

    # before running samtools, check its availability
    agBAMProgress.logger.info("check SAMtools")
    check_samtools(agBAMProgress)

    # running samtools
    agBAMProgress.logger.info("run SAMtools")

    rowFormat = "%s\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n"
    try:
        with open(agBAMOutAllJoinPairs, 'w') as fOUT:
            agBAMProgress.logger.info(
                "# processed\t| Current Reads ID\t| Elapsed Time")
            if debug:
                # Header now names all eleven columns written per pair
                # (alnLenA/alnLenB were missing).
                agBAMDebug.debugger.debug(
                    "Reads_ID\tLocationA\tLocationB\talnLenA\talnLenB\tmapQA\tmapQB\tsenseA\tsenseB\treadLenA\treadLenB"
                )
            startTime = time.time()
            dContigPairs = collections.defaultdict(list)
            nJoinPairs = 0
            nReadsPairs = 0
            for record in run_samtools(bamFile, agBAMProgress):
                mateLines = record.split("\n")
                pairA = mateLines[0].split("\t")
                pairB = mateLines[1].split("\t")
                readsID = pairA[0]
                contigA = pairA[2]
                contigB = pairB[2]
                nReadsPairs += 1
                # The "*" test makes sure single-end BAMs (and pairs with an
                # unmapped mate) yield zero joining pairs.  It is part of
                # the condition rather than a `continue` so the progress
                # tick below is never skipped.
                if (contigA != "*" and contigB != "*"
                        and readsID == pairB[0] and contigA != contigB):
                    alnLenA = getCIGAR(pairA[5])
                    alnLenB = getCIGAR(pairB[5])
                    leftMostPosA = int(pairA[3])  # 1-based in SAM
                    leftMostPosB = int(pairB[3])
                    readLenA = len(pairA[9])
                    readLenB = len(pairB[9])
                    nMismatchesA = getMismatches(pairA[11:])
                    nMismatchesB = getMismatches(pairB[11:])
                    mapQA = int(pairA[4])
                    mapQB = int(pairB[4])
                    senseA = explainSAMFlag(int(pairA[1]))[4]
                    senseB = explainSAMFlag(int(pairB[1]))[4]
                    if debug:
                        agBAMDebug.debugger.debug(
                            "%s\t%s\t%s\t%d\t%d\t%d\t%d\t%s\t%s\t%d\t%d" %
                            (readsID, contigA + ":" + str(leftMostPosA),
                             contigB + ":" + str(leftMostPosB), int(alnLenA),
                             int(alnLenB), mapQA, mapQB, senseA, senseB,
                             readLenA, readLenB))

                    # float() keeps these true ratios even if the helpers
                    # hand back ints on an interpreter with floor division.
                    fracOvlA = float(alnLenA) / readLenA
                    fracOvlB = float(alnLenB) / readLenB
                    fracMismatchA = float(nMismatchesA) / alnLenA
                    fracMismatchB = float(nMismatchesB) / alnLenB
                    if (min(fracOvlA, fracOvlB) >= minFracOvl
                            and max(fracMismatchA,
                                    fracMismatchB) <= maxFracMismatch
                            and min(mapQA, mapQB) >= minMapQ):
                        # Inclusive 1-based interval covered by the mate.
                        startA = leftMostPosA
                        stopA = startA + int(alnLenA) - 1
                        startB = leftMostPosB
                        stopB = startB + int(alnLenB) - 1
                        nJoinPairs += 1
                        # Store every pair under the lexicographically
                        # ordered contig key, swapping A/B when needed.
                        if contigA <= contigB:
                            key = (contigA, contigB)
                            pairRecord = (startA, startB, stopA, stopB,
                                          senseA, senseB, readsID)
                            row = (readsID, contigA, startA, stopA, senseA,
                                   contigB, startB, stopB, senseB)
                        else:
                            key = (contigB, contigA)
                            pairRecord = (startB, startA, stopB, stopA,
                                          senseB, senseA, readsID)
                            row = (readsID, contigB, startB, stopB, senseB,
                                   contigA, startA, stopA, senseA)
                        dContigPairs[key].append(pairRecord)
                        fOUT.write(rowFormat % row)
                if nReadsPairs % 5000000 == 0:
                    elapsedTime = float((time.time() - startTime) / 60)
                    agBAMProgress.logger.info(
                        "%d parsed\t| %s\t| %.2f m" %
                        (nReadsPairs, readsID, elapsedTime))
    except KeyboardInterrupt:
        agBAMProgress.logger.info(
            "Extract Joining-pairs INTERRUPTED by Keyboard")
        sys.exit(1)

    agBAMProgress.logger.info("%d reads pairs in the give BAM" % (nReadsPairs))
    agBAMProgress.logger.info("%d joining pairs parsed" % (nJoinPairs))
    agBAMProgress.logger.info("%d contig pairs given by these joining pairs" %
                              (len(dContigPairs)))
    if nJoinPairs == 0:
        agBAMProgress.logger.error("No joining pairs extracted")
        agBAMProgress.logger.error("Cannot SCAFFOLD without joining-pairs")
        sys.exit(1)
    agBAMProgress.logger.info("Succeeded")
    return dContigPairs
Esempio n. 10
0
def _iter_gff_features(handle):
    """Yield the tab-split columns of every feature line in a GFF handle.

    Blank lines and lines starting with '#' are skipped; iteration ends at
    the first ``##FASTA`` / ``##Fasta`` line so embedded sequence is never
    parsed as annotation.
    """
    for line in handle:
        if line.startswith("##FASTA") or line.startswith("##Fasta"):
            return
        stripped = line.strip()
        if stripped and not line.startswith('#'):
            yield stripped.split("\t")


def get_gene_models(gff, outDir, prefix, debug=0):
    """Parse gene models from a GFF file, grouped by contig.

    Every ``gene`` line starts a new ``AGOUTI_GFF`` object; the
    ``stop_codon``, ``start_codon`` and ``CDS`` lines that follow are applied
    to the most recently started gene.  Other feature types are ignored, as
    are sub-features that appear before the first gene.

    Parameters:
        gff: path of the GFF file to read.
        outDir, prefix: logs go to ``<outDir>/agouti_GFFs/<prefix>.*``.
        debug: truthy to also write a per-contig gene count to a debug log.

    Returns:
        ``defaultdict(list)`` mapping contig ID to its gene models, sorted
        by ``(geneStart, geneStop)`` in ascending order.

    Exits the process with status 1 when the file holds no gene, or when a
    gene line has no ``ID`` attribute.
    """
    moduleName = os.path.basename(__file__).split('.')[0].upper()
    moduleOutDir = os.path.join(outDir, "agouti_GFFs")
    if not os.path.exists(moduleOutDir):
        os.makedirs(moduleOutDir)
    progressLogFile = os.path.join(moduleOutDir,
                                   "%s.agouti_gff.progressMeter" % (prefix))
    agGFFProgress = agLOG.PROGRESS_METER(moduleName)
    agGFFProgress.add_file_handler(progressLogFile)
    agGFFProgress.logger.info("[BEGIN] Getting gene models")
    dGFFs = collections.defaultdict(list)
    curGene = None
    with open(gff, 'r') as fIN:
        for cols in _iter_gff_features(fIN):
            featureType = cols[2]
            if featureType == "gene":
                # partition() tolerates a trailing ';' (empty attribute) and
                # '=' inside a value, both of which broke split('=').
                geneID = None
                for attr in cols[8].split(';'):
                    attrID, _, attrVal = attr.partition('=')
                    if attrID.strip() == "ID":
                        geneID = attrVal
                        break
                if geneID is None:
                    # Previously the ID of the preceding gene was silently
                    # reused here.
                    agGFFProgress.logger.error(
                        "Gene without an ID attribute: %s" %
                        ("\t".join(cols)))
                    agGFFProgress.logger.error("Please check your GFF file")
                    sys.exit(1)
                # A gene is filed under its contig once it is complete, i.e.
                # when the next gene starts (or at the end of the loop).
                if curGene is not None:
                    dGFFs[curGene.ctgID].append(curGene)
                curGene = AGOUTI_GFF()
                curGene.setGene(geneID, int(cols[3]), int(cols[4]))
                curGene.setProgram(cols[1])
                curGene.setContigID(cols[0])
                curGene.setStrand(cols[6])
            elif curGene is None:
                # Sub-feature ahead of any gene: nothing to attach it to.
                continue
            elif featureType == "stop_codon":
                curGene.setStopCodon()
            elif featureType == "start_codon":
                curGene.setStartCodon()
            elif featureType == "CDS":
                curGene.updateCDS(int(cols[3]), int(cols[4]))

    if curGene is None:
        agGFFProgress.logger.error("Found zero genes")
        agGFFProgress.logger.error("Please check your GFF file")
        sys.exit(1)
    # File the final gene as well, whether parsing ended at EOF or at a
    # ##FASTA directive (the latter used to drop it).
    dGFFs[curGene.ctgID].append(curGene)

    agGFFDebug = None
    if debug:
        debugLogFile = os.path.join(moduleOutDir,
                                    "%s.agouti_gff.debug" % (prefix))
        agGFFDebug = agLOG.DEBUG(moduleName, debugLogFile)
        agGFFDebug.debugger.debug("Sequence\tNum_Gene_Models")

    nGeneModels = 0
    for ctgID in sorted(dGFFs):
        # make sure gene models are in ascending order
        orderedGenes = sorted(
            dGFFs[ctgID],
            key=lambda gene: (gene.geneStart, gene.geneStop))
        dGFFs[ctgID] = orderedGenes
        nGeneModels += len(orderedGenes)
        if debug:
            agGFFDebug.debugger.debug("%s\t%d" % (ctgID, len(orderedGenes)))

    agGFFProgress.logger.info("%d Gene Models parsed" % (nGeneModels))
    agGFFProgress.logger.info("[DONE]")
    return dGFFs