Example #1
0
def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits

    Additional optional parameters consumed here:
        namesMap (False): passed to readTaxonomy when taxonomy is given as a
        string
        valueType (int if a taxonomy is used, otherwise None): passed to
        parseMapFile when hitStringMap is given as a string
        allMethod (ALLEQ): passed to countIterHits

    infile may be a file name (str) or an already open handle. A handle
    opened here is always closed before returning; a handle supplied by the
    caller is left open.

    Returns the counts object produced by countIterHits.
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(
            taxonomy, namesMap=kwargs.pop('namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        # with a taxonomy, the mapped hit ids will need to be ints
        defaultValueType = int if taxonomy is not None else None
        valueType = kwargs.pop('valueType', defaultValueType)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    openedHere = isinstance(infile, str)
    inhandle = open(infile) if openedHere else infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  FilterParams(
                                      format=kwargs.pop('format', GENE),
                                      top_pct=kwargs.pop('filter_top_pct', 0),
                                  ),
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)
    finally:
        # close the handle even when parsing/counting fails, but only if we
        # were the ones who opened it
        if openedHere:
            inhandle.close()

    logger.info("Total hits: %s" % total)

    return counts
Example #2
0
def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits

    Other keyword arguments used by this function:
        namesMap (False): forwarded to readTaxonomy if taxonomy is a string
        valueType: forwarded to parseMapFile if hitStringMap is a string;
        defaults to int when a taxonomy is in use and to None otherwise
        allMethod (ALLEQ): forwarded to countIterHits

    If infile is a string it is opened here and closed again before
    returning (also on error); any other object is used as the input
    handle as-is and is not closed.

    Returns the counts value from countIterHits.
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        if taxonomy is not None:
            # the mapped hit ids will need to be ints
            valueType = kwargs.pop('valueType', int)
        else:
            valueType = kwargs.pop('valueType', None)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    ownsHandle = isinstance(infile, str)
    if ownsHandle:
        inhandle = open(infile)
    else:
        inhandle = infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  FilterParams(
                                      format=kwargs.pop('format', GENE),
                                      top_pct=kwargs.pop('filter_top_pct', 0),
                                  ),
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)
    finally:
        # never leak a handle we opened ourselves, even if counting raised
        if ownsHandle:
            inhandle.close()

    logger.info("Total hits: %s" % total)

    return counts
Example #3
0
def readTaxonomyFiles(options, namesMap=False):
    """
    Load the taxonomy from options.taxdir, or return None if no directory
    was supplied.

    A name lookup map is requested from readTaxonomy when namesMap is true
    or when options.parseStyle is ORGS.

    Raises an Exception when no taxonomy directory is given but
    options.countMethod is 'LCA' or 'rLCA'.
    """
    if options.taxdir is None:
        # the LCA count methods cannot work without a taxonomy
        if options.countMethod in ('LCA', 'rLCA'):
            raise Exception('Cannot use LCA without providng a taxonomy (-n)')
        logging.info("No taxonomy needed")
        return None

    # read taxonomy
    wantNames = namesMap or options.parseStyle == ORGS
    taxonomy = readTaxonomy(options.taxdir, namesMap=wantNames)
    nodeCount = len(taxonomy.idMap)
    logging.info("Read %d nodes from tax dump" % nodeCount)
    return taxonomy
Example #4
0
def readTaxonomyFiles(options, namesMap=False):
    """
    Return the taxonomy read from options.taxdir, or None when
    options.taxdir is not set.

    readTaxonomy is asked to build a name lookup map if namesMap is true or
    options.parseStyle equals ORGS. With no taxonomy directory, a
    countMethod of 'LCA' or 'rLCA' is rejected with an Exception.
    """
    haveTaxDir = options.taxdir is not None

    if not haveTaxDir:
        # LCA-style counting is impossible without a taxonomy
        needsTaxonomy = options.countMethod in ('LCA', 'rLCA')
        if needsTaxonomy:
            raise Exception('Cannot use LCA without providng a taxonomy (-n)')
        logging.info("No taxonomy needed")
        return None

    # read taxonomy
    loaded = readTaxonomy(
        options.taxdir,
        namesMap=(namesMap or options.parseStyle == ORGS))
    readMessage = "Read %d nodes from tax dump" % (len(loaded.idMap))
    logging.info(readMessage)
    return loaded
def main():
    """
    Command-line entry point: turn per-taxid count tables into a sunburst
    (or icicle) figure, or into a JSON tree of counts.

    Each input is read line by line as "taxid,count" pairs; a taxid of
    "None" (or one missing from the taxonomy's idMap) is counted under the
    key None. The counts are converted with convertToNx/convertToJSON,
    optionally recoloured (--colors), re-rooted (--root) and trimmed
    (--cutoff), then either dumped as JSON (--JSON) or plotted and saved in
    the requested image format.

    Exits through parser.error if no NCBI tax dump directory is given.
    """
    usage = "usage: %prog OPTIONS JSON_FILE(s)"
    description = __doc__
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addUniversalOptions(parser)

    parser.add_option(
        "-r", "--root", default=None, help="Plot a subset of the tree by choosing a root node for the subtree"
    )
    parser.add_option(
        "-c",
        "--colors",
        default=None,
        help="Set colors by mapping taxon names to color strings. Value should be a comma-separated list of id=color pairs (Bacteria=g,Archaea=r). The subtree of each mapped node will get the given color unless overridden by another entry.",
    )
    parser.add_option(
        "-C",
        "--cutoff",
        default=0.025,
        type="float",
        help="Trim nodes below this value. Interpreted as an absolute threshold if >1 and as fractional if <1. Set to 0 (or less) to turn off.",
    )

    parser.add_option(
        "-R", "--ranks", default="superkingdom,phylum,family", help="Ranks to inclued in sunburst, default: %default"
    )
    parser.add_option(
        "-n",
        "--ncbiTaxDir",
        dest="taxdir",
        metavar="PATH",
        default=None,
        help="Directory with unpacked ncbi tax dump (specifically names.dmp and nodes.dmp) and use to translate taxids into taxa. ",
    )
    parser.add_option(
        "-i",
        "--icicle",
        default=False,
        action="store_true",
        help="Print stacked bars in rectangular coordinates, not polar.",
    )
    parser.add_option(
        "-e",
        "--exterior_labels",
        default=False,
        action="store_true",
        help="Print labels for outermost nodes outside image",
    )
    parser.add_option(
        "-s",
        "--sortKey",
        default=[],
        action="append",
        choices=[NAME, COLOR, VALUE],
        help='how to sort nodes. Defaults to "color" and "name"',
    )

    parser.add_option(
        "-S", "--figsize", default=None, help="Comma separated pair of numbers (in inches) for figure size"
    )

    parser.add_option(
        "-f",
        "--format",
        dest="format",
        default="pdf",
        choices=["png", "ps", "pdf", "svg"],
        help="Format for output image",
        metavar="FORMAT",
    )
    parser.add_option(
        "-J", "--JSON", default=False, action="store_true", help="output JSON tree of counts instead of figure"
    )

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    if not options.JSON:
        # setup matplotlib: the backend must be selected before pyplot is
        # imported, which is why the import lives here
        backend = options.format
        if backend == "png":
            backend = "agg"
        matplotlib.use(backend)
        import matplotlib.pyplot as plt

    # load taxonomy
    if options.taxdir is None:
        parser.error("You must supply the location of the NCBI tax dump files")
    taxonomy = readTaxonomy(options.taxdir)

    # build rank list
    ranks = options.ranks.split(",")

    if options.JSON:
        # Standard iterator that returns handles
        inputIterator = inputIteratorNormal

        if options.sortKey:
            logger.warning("the SORT option has no effect on JSON output")
    else:
        # version that defaults to adding format as suffix and returns name
        inputIterator = inputIteratorFig

    # process user selected options
    kwargs = processOptions(options)

    # process input files
    for (inhandle, outfile) in inputIterator(args, options):
        # load counts
        counts = {}
        for line in inhandle:
            (taxid, count) = line.rstrip("\n\r").split(",")
            if taxid == "None":
                tax = None
            else:
                tax = taxonomy.idMap.get(int(taxid), None)
            counts[tax] = counts.get(tax, 0) + int(count)

        # convert to JSON
        (nxtree, root) = convertToNx(counts, leaves=True, ranks=ranks)
        tree = convertToJSON(nxtree, root)

        # process JSON
        if options.colors is not None:
            setColors(tree, options.colors, **kwargs)
        if options.root is not None:
            newRoot = findNode(tree, options.root, **kwargs)
            if newRoot is not None:
                tree = newRoot

        applyCutoff(tree, options.cutoff, **kwargs)
        if options.JSON:
            putNodeCountsInOther(tree)
            outfile.write(json.dumps(tree, indent=2))
        else:
            # some of the matplotlib functions don't like extra arguments.
            # Strip ID from a per-file copy: popping it from kwargs itself
            # would raise KeyError on the second input file and would hide
            # ID from the setColors/findNode/applyCutoff calls above.
            plotKwargs = dict(kwargs)
            plotKwargs.pop(ID, None)

            # create figure
            plotSunburstJSON(tree, **plotKwargs)

            # save to file
            plt.savefig(outfile, format=options.format)
def main():
    """
    Reduce a catalog read from STDIN to an accession -> taxid table on
    STDOUT.

    Each tab-separated input line supplies (taxid, name, accession) in its
    first three columns. Protein accessions are kept by default; with
    -g/--genomic the non-protein ones are kept instead. After the catalog
    is read, the taxonomy is loaded from the TAXDUMP path given as the only
    positional argument, and any taxid not present in taxonomy.idMap is
    replaced by the id of the node that getNodeFromHit finds for the
    organism name.
    """
    usage = '%prog [OPTIONS] TAXDUMP_PATH'
    description = 'reduce full RefSea catalog (from STDIN) to acc->taxid map using TAXDUMP to verify taxids'

    parser = OptionParser(usage, description=description)
    parser.add_option(
        "-g",
        "--genomic",
        default=False,
        action="store_true",
        help="output genoic accessions instead of proteins")
    parser.add_option(
        "-v",
        "--verbose",
        action="count",
        dest="verbose",
        default=1,
        help="Print log messages. Use twice for debugging")
    parser.add_option(
        "-q",
        '--quiet',
        dest='verbose',
        action="store_const",
        const=0,
        help="Suppress warnings. Only print fatal messages")
    parser.add_option(
        "-A",
        "--about",
        action="store_true",
        dest="about",
        default=False,
        help="Print description")

    (options, args) = parser.parse_args()

    if options.about:
        # parenthesised single-argument print behaves the same as a
        # statement (Python 2) and as a function call (Python 3)
        print(description)
        sys.exit(0)

    # check args: translate the verbosity count into a logging level
    if options.verbose == 0:
        loglevel = logging.ERROR
    elif options.verbose == 1:
        loglevel = logging.WARN
    elif options.verbose == 2:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG
    logging.basicConfig(stream=sys.stderr, level=loglevel)
    logging.info("Log level set to %r(%d)" % (loglevel, options.verbose))

    if len(args) != 1:
        parser.error("Please supply TAXDUMP path in command line")

    # parse catalog
    accToOrg = {}
    accRE = re.compile(r'\b([A-Z]{2}_[A-Z]*\d+)(\.\b)?\b')
    protRE = re.compile(r'^([ANXWYZ]P_[A-Z]*\d+)$')
    logging.info("reading catalog from STDIN")
    for line in sys.stdin:
        cells = line.rstrip('\r\n').split('\t')
        (taxid, name, acc) = cells[0:3]
        try:
            taxid = int(taxid)
        except ValueError:
            # keep a non-numeric taxid as a string; presumably it will be
            # missing from taxonomy.idMap and get resolved by name below
            pass
        logging.debug(acc)
        accMatch = accRE.match(acc)
        if accMatch is None:
            # previously this crashed with AttributeError on .group()
            logging.warning("Skipping unrecognized accession: %r" % (acc))
            continue
        acc = accMatch.group(1)
        logging.debug("'%s'" % acc)
        m = protRE.match(acc)
        if (m is None) == options.genomic:
            # will match if acc matches exp and genomic is false OR
            #            if acc doesn't match and genomic is true
            accToOrg[acc] = (taxid, name)
            logging.debug("USing acc: %s" % (acc))
        else:
            logging.debug("Skipping acc: %s" % (acc))

    # load taxonomy
    logging.info("loading taxonomy from %s" % (args[0]))
    taxonomy = readTaxonomy(args[0], namesMap=True)

    # print table
    changes = 0
    # items() exists on both Python 2 and 3 (iteritems() is Python 2 only)
    for (acc, (taxid, name)) in accToOrg.items():
        if taxid not in taxonomy.idMap:
            node = getNodeFromHit(name, taxonomy.nameMap)
            logging.debug("Changing %s to %s" % (taxid, node.id))
            taxid = node.id
            changes += 1
        print('\t'.join([acc, str(taxid)]))
    logging.info("Changed %d taxon ids" % changes)
Example #7
0
def main():
    """
    Reduce a catalog read from STDIN to an accession -> taxid table printed
    to STDOUT.

    The first three tab-separated columns of every input line are taken as
    (taxid, name, accession). By default only protein accessions are kept;
    -g/--genomic keeps the non-protein accessions instead. Once the catalog
    has been read, the taxonomy is loaded from the single positional
    TAXDUMP path and every taxid missing from taxonomy.idMap is replaced by
    the id of the node getNodeFromHit returns for the organism name.
    """
    usage = '%prog [OPTIONS] TAXDUMP_PATH'
    description = 'reduce full RefSea catalog (from STDIN) to acc->taxid map using TAXDUMP to verify taxids'

    parser = OptionParser(usage, description=description)
    parser.add_option("-g",
                      "--genomic",
                      default=False,
                      action="store_true",
                      help="output genoic accessions instead of proteins")
    parser.add_option("-v",
                      "--verbose",
                      action="count",
                      dest="verbose",
                      default=1,
                      help="Print log messages. Use twice for debugging")
    parser.add_option("-q",
                      '--quiet',
                      dest='verbose',
                      action="store_const",
                      const=0,
                      help="Suppress warnings. Only print fatal messages")
    parser.add_option("-A",
                      "--about",
                      action="store_true",
                      dest="about",
                      default=False,
                      help="Print description")

    (options, args) = parser.parse_args()

    if options.about:
        print(description)
        sys.exit(0)

    # check args: translate the verbosity count into a logging level
    if options.verbose == 0:
        loglevel = logging.ERROR
    elif options.verbose == 1:
        loglevel = logging.WARN
    elif options.verbose == 2:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG
    logging.basicConfig(stream=sys.stderr, level=loglevel)
    logging.info("Log level set to %r(%d)" % (loglevel, options.verbose))

    if len(args) != 1:
        parser.error("Please supply TAXDUMP path in command line")

    # parse catalog
    accToOrg = {}
    accRE = re.compile(r'\b([A-Z]{2}_[A-Z]*\d+)(\.\b)?\b')
    protRE = re.compile(r'^([ANXWYZ]P_[A-Z]*\d+)$')
    logging.info("reading catalog from STDIN")
    for line in sys.stdin:
        cells = line.rstrip('\r\n').split('\t')
        (taxid, name, acc) = cells[0:3]
        try:
            taxid = int(taxid)
        except ValueError:
            # keep a non-numeric taxid as a string; presumably it will be
            # missing from taxonomy.idMap and get resolved by name below
            pass
        logging.debug(acc)
        accMatch = accRE.match(acc)
        if accMatch is None:
            # previously this crashed with AttributeError on .group()
            logging.warning("Skipping unrecognized accession: %r" % (acc))
            continue
        acc = accMatch.group(1)
        logging.debug("'%s'" % acc)
        m = protRE.match(acc)
        if (m is None) == options.genomic:
            # will match if acc matches exp and genomic is false OR
            #            if acc doesn't match and genomic is true
            accToOrg[acc] = (taxid, name)
            logging.debug("USing acc: %s" % (acc))
        else:
            logging.debug("Skipping acc: %s" % (acc))

    # load taxonomy
    logging.info("loading taxonomy from %s" % (args[0]))
    taxonomy = readTaxonomy(args[0], namesMap=True)

    # print table
    changes = 0
    # dict.iteritems() does not exist on Python 3; items() works everywhere
    for (acc, (taxid, name)) in accToOrg.items():
        if taxid not in taxonomy.idMap:
            node = getNodeFromHit(name, taxonomy.nameMap)
            logging.debug("Changing %s to %s" % (taxid, node.id))
            taxid = node.id
            changes += 1
        print('\t'.join([acc, str(taxid)]))
    logging.info("Changed %d taxon ids" % changes)