Ejemplo n.º 1
0
def getLevelMapper(level, arguments):
    """
    Build a one-argument function that translates a gene ID to `level`.

    Parameters:
        level: the requested hierarchy level. Values found in ``koSyns``
            mean "leave the gene ID as it is"; values found in
            ``level3Syns`` are looked up as level ``'3'``.
        arguments: options object. Only ``heirarchyType`` (one of
            'cazy', 'kegg', 'seed' or 'cog') and ``heirarchyFile`` are
            read here.

    Returns:
        A callable taking a gene ID. For 'cazy' this is ``getCazyGroup``;
        for gene-level requests it is the identity; otherwise it looks the
        gene up in the parsed mapping and falls back to the gene itself
        when there is no entry.

    Raises:
        ValueError: if a translation is needed and ``heirarchyType`` is
            not one of the supported values.
    """
    if level in koSyns:
        # Gene-level output requested: nothing to translate
        return lambda h: h
    if arguments.heirarchyType == 'cazy':
        return getCazyGroup

    lookupLevel = level if level not in level3Syns else '3'
    heirarchyType = arguments.heirarchyType

    if heirarchyType == 'kegg':
        # Ideally, we'd be able to parse the hierarchy once, but the current
        # KEGG code just returns simple mappings
        logging.info("Reading KEGG level %s assignments from %s",
                     level, arguments.heirarchyFile)
        geneTranslation = kegg.parse_KEGG_file(arguments.heirarchyFile,
                                               lookupLevel)
    elif heirarchyType == 'seed':
        logging.info("Reading SEED subsystem assignments from %s",
                     arguments.heirarchyFile)
        geneTranslation = kegg.readSEEDTree(
            arguments.heirarchyFile)[lookupLevel]
    elif heirarchyType == 'cog':
        logging.info("Reading COG subsystem assignments from %s",
                     arguments.heirarchyFile)
        geneTranslation = kegg.readCogTree(
            arguments.heirarchyFile)[lookupLevel]
    else:
        # Previously this case fell through to an unbound local variable
        raise ValueError(
            "Unsupported hierarchy type: %r" % (heirarchyType,))

    # Genes without an entry in the mapping are passed through unchanged
    return lambda gene: geneTranslation.get(gene, gene)
Ejemplo n.º 2
0
def getLevelMapper(level, arguments):
    """
    Return a function mapping a gene ID onto the requested level.

    Parameters:
        level: the requested hierarchy level. A value in ``koSyns`` asks
            for the gene ID itself; a value in ``level3Syns`` is treated
            as level ``'3'`` when querying the hierarchy.
        arguments: options object; this function reads its
            ``heirarchyType`` ('cazy', 'kegg', 'seed' or 'cog') and
            ``heirarchyFile`` attributes.

    Returns:
        A one-argument callable: the identity for gene-level requests,
        ``getCazyGroup`` for 'cazy', and otherwise a lookup in the parsed
        mapping that returns the gene unchanged when it has no entry.

    Raises:
        ValueError: if ``heirarchyType`` is not a supported value and a
            translation is required.
    """
    if level in koSyns:
        # No translation needed when the gene itself is the level
        return lambda h: h

    heirarchyType = arguments.heirarchyType
    if heirarchyType == 'cazy':
        return getCazyGroup

    lookupLevel = level if level not in level3Syns else '3'

    if heirarchyType == 'kegg':
        # Ideally, we'd be able to parse the hierarchy once, but the current
        # KEGG code just returns simple mappings
        logging.info("Reading KEGG level %s assignments from %s",
                     level, arguments.heirarchyFile)
        geneTranslation = kegg.readKEGGFile(
            arguments.heirarchyFile, lookupLevel)
    elif heirarchyType == 'seed':
        logging.info("Reading SEED subsystem assignments from %s",
                     arguments.heirarchyFile)
        seedTree = kegg.readSEEDTree(arguments.heirarchyFile)
        geneTranslation = seedTree[lookupLevel]
    elif heirarchyType == 'cog':
        logging.info("Reading COG subsystem assignments from %s",
                     arguments.heirarchyFile)
        seedTree = kegg.readCogTree(arguments.heirarchyFile)
        geneTranslation = seedTree[lookupLevel]
    else:
        # The original code reached an unbound ``seedTree`` here
        raise ValueError(
            "Unsupported hierarchy type: %r" % (heirarchyType,))

    # Fall back to the gene itself when the mapping has no entry for it
    return lambda gene: geneTranslation.get(gene, gene)
Ejemplo n.º 3
0
def printCountTablesByLevel(fileCounts, totals, fileNames, options):
    """
    Create a new file for each level with a tab separated table of counts

    Parameters:
        fileCounts: dict mapping each file name to a dict of
            {gene: count}.
        totals: dict mapping each file name to its total count; multiplied
            by ``options.cutoff`` to get that file's threshold.
        fileNames: ordered file names; one output column per entry.
        options: options object. Reads ``cutoff``, ``heirarchyType``,
            ``heirarchyFile``, ``levels`` and ``output_file``.

    For every level, gene counts are translated to that level and summed.
    A row is kept only if its count is strictly above the threshold in at
    least one file; all other rows are merged into a single 'Other' row.
    Tables go to stdout when ``options.output_file`` is None, otherwise to
    that file (suffixed with ".<level>" when several levels are requested).
    """
    cutoff = options.cutoff

    # NOTE: seedTree is only bound for the 'seed' and 'cog' hierarchies
    if options.heirarchyType == 'seed':
        logging.info(
            "Reading SEED subsystem assignments from %s",
            options.heirarchyFile)
        seedTree = kegg.readSEEDTree(options.heirarchyFile)
    elif options.heirarchyType == 'cog':
        logging.info(
            "Reading COG subsystem assignments from %s",
            options.heirarchyFile)
        seedTree = kegg.readCogTree(options.heirarchyFile)

    # create an output table for each requested level
    for level in options.levels:
        logging.debug("Processing level %s", level)
        translateToPaths = level not in koSyns
        descString = None
        if translateToPaths:
            if options.heirarchyType == 'cazy':
                geneTranslator = getCazyGroup
            else:
                lookupLevel = level if level not in level3Syns else '3'

                if options.heirarchyType == 'kegg':
                    # Ideally, we'd be able to parse the hierarchy once, but
                    # the current KEGG code just returns simple mappings
                    logging.info(
                        "Reading KEGG level %s assignments from %s",
                        level, options.heirarchyFile)
                    geneTranslation = kegg.readKEGGFile(
                        options.heirarchyFile, lookupLevel)
                else:
                    # SEED or COG/KOG
                    geneTranslation = seedTree[lookupLevel]
                geneTranslator = dict_lookup_default_to_query(geneTranslation)

        elif level is not None and options.heirarchyType == 'kegg':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description"
            logging.info(
                "Reading KO descriptions from %s",
                options.heirarchyFile)
            geneTranslation = kegg.readKEGGFile(
                options.heirarchyFile, "DESCRIPTION")
            geneTranslator = lambda_get_gene_and_translation(geneTranslation)
        elif level is not None and options.heirarchyType == 'cog':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description\tCategories"
            geneTranslator = lambda_get_seed_translations(seedTree)
        else:
            # just return gene if no level set or not KEGG/COG/KOG
            geneTranslator = passThrough

        # For each level, try to force all counts to be at that level
        fileLevelTotals = {}
        levelCounts = {}
        levelPaths = {}
        thresholds = {}
        for (filename, counts) in fileCounts.items():
            fileLevelTotals[filename] = 0
            thresholds[filename] = totals[filename] * cutoff
            fileLevelCounts = levelCounts.setdefault(filename, {})

            fileTotal = 0
            # None is sorted as the empty string so mixed keys can be ordered
            for gene in sorted(counts.keys(),
                               key=lambda s: "" if s is None else s):
                # get the counts from this node
                geneCount = counts[gene]
                fileTotal += geneCount

                # translate gene to pathway (or not depending on above code)
                pathway = geneTranslator(gene)

                # update counts
                # Some KOs will map to multiple pathways,
                #  so... allow for multiple translated values
                if not isinstance(pathway, (list, tuple)):
                    pathway = [pathway, ]
                for indPathway in pathway:
                    fileLevelCounts[indPathway] = fileLevelCounts.get(
                        indPathway, 0) + geneCount
                    levelPaths[indPathway] = True

            logging.debug(
                "File %s has %d hits (had %d)",
                filename, fileTotal, totals[filename])

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts",
                              filename, sum(counts.values()))

        # apply cutoff
        other = 'Other'
        # iterate over a snapshot because levelPaths is modified in the loop
        for pathway in list(levelPaths.keys()):
            # check to see if pathway is over cutoff in any file
            over = False
            for (filename, fileLevelCount) in levelCounts.items():
                flPathCount = fileLevelCount.get(pathway, 0)
                fileLevelTotals[filename] += flPathCount
                if flPathCount > thresholds[filename]:
                    over = True
            if not over:
                # this pathway is not over the cutoff for any file
                levelPaths.pop(pathway)
                levelPaths[other] = True
                for fileLevelCount in levelCounts.values():
                    # Remove the pathway before reading the running 'Other'
                    # total: if the pathway is itself called 'Other', reading
                    # first would count it twice.
                    moved = fileLevelCount.pop(pathway, 0)
                    fileLevelCount[other] = fileLevelCount.get(
                        other, 0) + moved

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts",
                              filename, sum(counts.values()))
                missed = False
                for (path, pathCount) in counts.items():
                    if path not in levelPaths:
                        missed = True
                        logging.debug(
                            "Missing pathway %s has %d counts for %s",
                            path, pathCount, filename)
                if not missed:
                    logging.debug(
                        "There are no missing pathways from %s",
                        filename)

        logging.debug("Final file counts: %r", fileLevelTotals)

        # output file
        if options.output_file is None:
            outs = sys.stdout
        else:
            if len(options.levels) > 1:
                outfile = "%s.%s" % (options.output_file, level)
            else:
                outfile = options.output_file
            outs = open(outfile, 'w')

        try:
            # header
            columnNames = '\t'.join(fileNames)
            if level in koSyns:
                # Header for when level is the gene
                if descString is not None:
                    outs.write("Gene\t%s\t%s\n" % (descString, columnNames))
                else:
                    outs.write("Gene\t%s\n" % (columnNames))
            else:
                # Header for when level is a pathway or group
                outs.write("Pathway\t%s\n" % (columnNames))

            # one row per surviving pathway, one column per input file
            for pathway in sorted(levelPaths.keys(),
                                  key=lambda s: "" if s is None else s):
                row = [str(pathway)]
                for filename in fileNames:
                    row.append(str(levelCounts[filename].get(pathway, 0)))
                outs.write('\t'.join(row))
                outs.write("\n")
        finally:
            # close the stream even if writing failed, but never close stdout
            if options.output_file is not None:
                outs.close()
Ejemplo n.º 4
0
def printCountTablesByLevel(fileCounts, totals, fileNames, options):
    """
    Create a new file for each level with a tab separated table of counts

    Parameters:
        fileCounts: dict of {file name: {gene: count}}.
        totals: dict of {file name: total count}; each file's threshold is
            its total multiplied by ``options.cutoff``.
        fileNames: ordered file names used as the table columns.
        options: options object. Reads ``cutoff``, ``heirarchyType``,
            ``heirarchyFile``, ``levels`` and ``output_file``.

    For each requested level the gene counts are translated to that level
    and added up. Rows whose count is not strictly above the threshold in
    any file are collapsed into one 'Other' row. Output goes to stdout
    when ``options.output_file`` is None; otherwise to that path, with
    ".<level>" appended when more than one level is requested.
    """
    cutoff = options.cutoff

    # NOTE: seedTree is only bound for the 'seed' and 'cog' hierarchies
    if options.heirarchyType == 'seed':
        logging.info("Reading SEED subsystem assignments from %s",
                     options.heirarchyFile)
        seedTree = kegg.readSEEDTree(options.heirarchyFile)
    elif options.heirarchyType == 'cog':
        logging.info("Reading COG subsystem assignments from %s",
                     options.heirarchyFile)
        seedTree = kegg.readCogTree(options.heirarchyFile)

    # create an output table for each requested level
    for level in options.levels:
        logging.debug("Processing level %s", level)
        translateToPaths = level not in koSyns
        descString = None
        if translateToPaths:
            if options.heirarchyType == 'cazy':
                geneTranslator = getCazyGroup
            else:
                lookupLevel = level if level not in level3Syns else '3'

                if options.heirarchyType == 'kegg':
                    # Ideally, we'd be able to parse the hierarchy once, but
                    # the current KEGG code just returns simple mappings
                    logging.info("Reading KEGG level %s assignments from %s",
                                 level, options.heirarchyFile)
                    geneTranslation = kegg.readKEGGFile(
                        options.heirarchyFile, lookupLevel)
                else:
                    # SEED or COG/KOG
                    geneTranslation = seedTree[lookupLevel]
                geneTranslator = dict_lookup_default_to_query(geneTranslation)

        elif level is not None and options.heirarchyType == 'kegg':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description"
            logging.info("Reading KO descriptions from %s",
                         options.heirarchyFile)
            geneTranslation = kegg.readKEGGFile(options.heirarchyFile,
                                                "DESCRIPTION")
            geneTranslator = lambda_get_gene_and_translation(geneTranslation)
        elif level is not None and options.heirarchyType == 'cog':
            # return descriptions if level explicitly set to ko (or syn.)
            descString = "Description\tCategories"
            geneTranslator = lambda_get_seed_translations(seedTree)
        else:
            # just return gene if no level set or not KEGG/COG/KOG
            geneTranslator = passThrough

        # For each level, try to force all counts to be at that level
        fileLevelTotals = {}
        levelCounts = {}
        levelPaths = {}
        thresholds = {}
        for (filename, counts) in fileCounts.items():
            fileLevelTotals[filename] = 0
            thresholds[filename] = totals[filename] * cutoff
            fileLevelCounts = levelCounts.setdefault(filename, {})

            fileTotal = 0
            # None keys are ordered as if they were the empty string
            for gene in sorted(counts.keys(),
                               key=lambda s: "" if s is None else s):
                # get the counts from this node
                geneCount = counts[gene]
                fileTotal += geneCount

                # translate gene to pathway (or not depending on above code)
                pathway = geneTranslator(gene)

                # update counts
                # Some KOs will map to multiple pathways,
                #  so... allow for multiple translated values
                if not isinstance(pathway, (list, tuple)):
                    pathway = [pathway]
                for indPathway in pathway:
                    fileLevelCounts[indPathway] = fileLevelCounts.get(
                        indPathway, 0) + geneCount
                    levelPaths[indPathway] = True

            logging.debug("File %s has %d hits (had %d)", filename, fileTotal,
                          totals[filename])

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts", filename,
                              sum(counts.values()))

        # apply cutoff
        other = 'Other'
        # levelPaths changes inside the loop, so walk a snapshot of its keys
        for pathway in list(levelPaths.keys()):
            # check to see if pathway is over cutoff in any file
            over = False
            for (filename, fileLevelCount) in levelCounts.items():
                flPathCount = fileLevelCount.get(pathway, 0)
                fileLevelTotals[filename] += flPathCount
                if flPathCount > thresholds[filename]:
                    over = True
            if not over:
                # this pathway is not over the cutoff for any file
                levelPaths.pop(pathway)
                levelPaths[other] = True
                for fileLevelCount in levelCounts.values():
                    # Pop before reading the current 'Other' value so that a
                    # pathway which is itself named 'Other' is not added to
                    # its own count.
                    moved = fileLevelCount.pop(pathway, 0)
                    fileLevelCount[other] = fileLevelCount.get(
                        other, 0) + moved

        if logging.getLogger().level <= logging.DEBUG:
            for (filename, counts) in levelCounts.items():
                logging.debug("File %s has %d counts", filename,
                              sum(counts.values()))
                missed = False
                for (path, pathCount) in counts.items():
                    if path not in levelPaths:
                        missed = True
                        logging.debug(
                            "Missing pathway %s has %d counts for %s", path,
                            pathCount, filename)
                if not missed:
                    logging.debug("There are no missing pathways from %s",
                                  filename)

        logging.debug("Final file counts: %r", fileLevelTotals)

        # output file
        if options.output_file is None:
            outs = sys.stdout
        else:
            if len(options.levels) > 1:
                outfile = "%s.%s" % (options.output_file, level)
            else:
                outfile = options.output_file
            outs = open(outfile, 'w')

        try:
            # header
            columnNames = '\t'.join(fileNames)
            if level in koSyns:
                # Header for when level is the gene
                if descString is not None:
                    outs.write("Gene\t%s\t%s\n" % (descString, columnNames))
                else:
                    outs.write("Gene\t%s\n" % (columnNames))
            else:
                # Header for when level is a pathway or group
                outs.write("Pathway\t%s\n" % (columnNames))

            # body: one line per remaining pathway, one field per file
            for pathway in sorted(levelPaths.keys(),
                                  key=lambda s: "" if s is None else s):
                row = [str(pathway)]
                for filename in fileNames:
                    row.append(str(levelCounts[filename].get(pathway, 0)))
                outs.write('\t'.join(row))
                outs.write("\n")
        finally:
            # always release a file we opened; stdout is left open
            if options.output_file is not None:
                outs.close()