def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
            something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score
            for each read
        parseStyle (ACCS): how to process hit data into an identifying
            string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
            containing taxdmp
        rank (None): Maximum rank to resolve hits

    Further keywords consumed here:
        namesMap (False): passed to readTaxonomy, only when taxonomy is
            given as a string
        valueType: passed to parseMapFile, only when hitStringMap is given
            as a string; defaults to int when a taxonomy is in use and to
            None otherwise
        allMethod (ALLEQ): passed to countIterHits

    infile may be a file name or an already open handle. A handle opened
    here is always closed again, even if parsing fails; a handle supplied
    by the caller is left open.

    Returns the counts object produced by countIterHits.
    """
    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))

    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        # with a taxonomy, the mapped hit ids will need to be ints
        defaultValueType = int if taxonomy is not None else None
        valueType = kwargs.pop('valueType', defaultValueType)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    openedHere = isinstance(infile, str)
    inhandle = open(infile) if openedHere else infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(
            inhandle,
            hitStringMap,
            FilterParams(
                format=kwargs.pop('format', GENE),
                top_pct=kwargs.pop('filter_top_pct', 0),
            ),
            kwargs.pop('parseStyle', ACCS),
            kwargs.pop('countMethod', 'all'),
            taxonomy=taxonomy,
            rank=kwargs.pop('rank', None))

        # count the hits; the handle has to stay open until the iterator
        # has been consumed
        (total, counts) = countIterHits(
            hitIter,
            allMethod=kwargs.pop('allMethod', ALLEQ),
            returnMap=False)
    finally:
        # only close what we opened ourselves
        if openedHere:
            inhandle.close()

    logger.info("Total hits: %s" % total)
    return counts
def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
            something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score
            for each read
        parseStyle (ACCS): how to process hit data into an identifying
            string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
            containing taxdmp
        rank (None): Maximum rank to resolve hits

    Other keywords read by this function:
        namesMap (False): forwarded to readTaxonomy when taxonomy is a
            string
        valueType: forwarded to parseMapFile when hitStringMap is a string
            (default int if a taxonomy is set, else None)
        allMethod (ALLEQ): forwarded to countIterHits

    infile is either a path (opened and closed by this function, also on
    error) or an open handle (never closed by this function).

    Returns the counts object produced by countIterHits.
    """
    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        namesMap = kwargs.pop('namesMap', False)
        taxonomy = readTaxonomy(taxonomy, namesMap=namesMap)

    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        if taxonomy is not None:
            # the mapped hit ids will need to be ints
            valueType = kwargs.pop('valueType', int)
        else:
            valueType = kwargs.pop('valueType', None)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    infileIsPath = isinstance(infile, str)
    if infileIsPath:
        inhandle = open(infile)
    else:
        inhandle = infile

    try:
        # get iterator over reads that will parse hits
        filterParams = FilterParams(
            format=kwargs.pop('format', GENE),
            top_pct=kwargs.pop('filter_top_pct', 0),
        )
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  filterParams,
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits (this is what actually reads from the handle)
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)
        logger.info("Total hits: %s" % total)
    finally:
        # release the file even when parsing or counting raised
        if infileIsPath:
            inhandle.close()

    return counts
def readTaxonomyFiles(options, namesMap=False):
    """
    Load the taxonomy specified by the user.

    options must provide taxdir and countMethod; parseStyle is consulted
    only when a taxonomy directory was given and namesMap is false.

    A name lookup map is requested from readTaxonomy when namesMap is true
    or when options.parseStyle equals ORGS.

    Returns the object built by readTaxonomy, or None when options.taxdir
    is None. Raises Exception when no directory was given but
    options.countMethod is 'LCA' or 'rLCA'.
    """
    taxDir = options.taxdir

    # no directory given: nothing to load, but LCA counting needs a tree
    if taxDir is None:
        if options.countMethod in ('LCA', 'rLCA'):
            raise Exception('Cannot use LCA without providng a taxonomy (-n)')
        logging.info("No taxonomy needed")
        return None

    # read taxonomy
    wantNames = namesMap or options.parseStyle == ORGS
    loaded = readTaxonomy(taxDir, namesMap=wantNames)
    nodeCount = len(loaded.idMap)
    logging.info("Read %d nodes from tax dump" % (nodeCount))
    return loaded
def readTaxonomyFiles(options, namesMap=False):
    """
    Load the taxonomy specified by the user.

    When options.taxdir is set, the directory is handed to readTaxonomy;
    a name lookup map is requested if namesMap is true or if
    options.parseStyle equals ORGS. When options.taxdir is None the result
    is None, unless options.countMethod is 'LCA' or 'rLCA', in which case
    an Exception is raised.
    """
    result = None

    if options.taxdir is None:
        needsTaxonomy = options.countMethod in ('LCA', 'rLCA')
        if needsTaxonomy:
            raise Exception('Cannot use LCA without providng a taxonomy (-n)')
        logging.info("No taxonomy needed")
    else:
        # read taxonomy
        result = readTaxonomy(
            options.taxdir,
            namesMap=(namesMap or options.parseStyle == ORGS))
        logging.info("Read %d nodes from tax dump" % (len(result.idMap)))

    return result
def main():
    """
    Command line entry point: turn taxid count tables into a sunburst
    figure or a JSON tree.

    Each input is read as lines of "taxid,count". Taxids are looked up in
    the taxonomy loaded from --ncbiTaxDir (the literal "None" and unknown
    ids are both counted under None), and counts for the same taxon are
    summed. The counts are converted to a tree, optionally colored (-c),
    re-rooted (-r) and trimmed (-C), and then either dumped as JSON (-J)
    or plotted and saved in the requested image format (-f).

    Exits through parser.error if no taxonomy directory is given.
    """
    usage = "usage: %prog OPTIONS JSON_FILE(s)"
    description = __doc__
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addUniversalOptions(parser)
    parser.add_option(
        "-r", "--root", default=None,
        help="Plot a subset of the tree by choosing a root node for the "
             "subtree")
    parser.add_option(
        "-c", "--colors", default=None,
        help="Set colors by mapping taxon names to color strings. "
             "Value should be a comma-separated list of id=color pairs "
             "(Bacteria=g,Archaea=r). The subtree of each mapped node will "
             "get the given color unless overridden by another entry.")
    parser.add_option(
        "-C", "--cutoff", default=0.025, type="float",
        help="Trim nodes below this value. Interpreted as an absolute "
             "threshold if >1 and as fractional if <1. Set to 0 (or less) "
             "to turn off.")
    parser.add_option(
        "-R", "--ranks", default="superkingdom,phylum,family",
        help="Ranks to inclued in sunburst, default: %default")
    parser.add_option(
        "-n", "--ncbiTaxDir", dest="taxdir", metavar="PATH", default=None,
        help="Directory with unpacked ncbi tax dump (specifically "
             "names.dmp and nodes.dmp) and use to translate taxids "
             "into taxa. ")
    parser.add_option(
        "-i", "--icicle", default=False, action="store_true",
        help="Print stacked bars in rectangular coordinates, not polar.")
    parser.add_option(
        "-e", "--exterior_labels", default=False, action="store_true",
        help="Print labels for outermost nodes outside image")
    parser.add_option(
        "-s", "--sortKey", default=[], action="append",
        choices=[NAME, COLOR, VALUE],
        help='how to sort nodes. Defaults to "color" and "name"')
    parser.add_option(
        "-S", "--figsize", default=None,
        help="Comma separated pair of numbers (in inches) for figure size")
    parser.add_option(
        "-f", "--format", dest="format", default="pdf",
        choices=["png", "ps", "pdf", "svg"],
        help="Format for output image", metavar="FORMAT")
    parser.add_option(
        "-J", "--JSON", default=False, action="store_true",
        help="output JSON tree of counts instead of figure")

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    if not options.JSON:
        # setup matplotlib: the backend has to be selected before pyplot
        # is imported, which is why the import lives here
        backend = options.format
        if backend == "png":
            backend = "agg"
        matplotlib.use(backend)
        import matplotlib.pyplot as plt

    # load taxonomy
    if options.taxdir is None:
        parser.error("You must supply the location of the NCBI tax dump files")
    taxonomy = readTaxonomy(options.taxdir)

    # build rank list
    ranks = options.ranks.split(",")

    if options.JSON:
        # Standard iterator that returns handles
        inputIterator = inputIteratorNormal
        if len(options.sortKey) > 0:
            logger.warning("the SORT option has no effect on JSON output")
    else:
        # version that defaults to adding format as suffix and returns name
        inputIterator = inputIteratorFig

    # process user selected options
    kwargs = processOptions(options)

    # process input files
    for (inhandle, outfile) in inputIterator(args, options):
        # load counts
        counts = {}
        for line in inhandle:
            record = line.rstrip("\n\r")
            if not record:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            (taxid, count) = record.split(",")
            if taxid == "None":
                tax = None
            else:
                tax = taxonomy.idMap.get(int(taxid), None)
            counts[tax] = counts.get(tax, 0) + int(count)

        # convert to JSON
        (nxtree, root) = convertToNx(counts, leaves=True, ranks=ranks)
        tree = convertToJSON(nxtree, root)

        # process JSON
        if options.colors is not None:
            setColors(tree, options.colors, **kwargs)
        if options.root is not None:
            newRoot = findNode(tree, options.root, **kwargs)
            if newRoot is not None:
                tree = newRoot
        applyCutoff(tree, options.cutoff, **kwargs)

        if options.JSON:
            putNodeCountsInOther(tree)
            outfile.write(json.dumps(tree, indent=2))
        else:
            # some of the matplotlib functions don't like extra arguments.
            # Drop ID from a copy only: kwargs is reused for every input
            # file, and popping from it directly would fail on the second
            # file and change what the helpers above receive.
            plotKwargs = dict(kwargs)
            plotKwargs.pop(ID, None)

            # create figure
            plotSunburstJSON(tree, **plotKwargs)

            # save to file
            plt.savefig(outfile, format=options.format)
def main():
    """
    Command line entry point: reduce a catalog read from STDIN to an
    accession -> taxid table printed on STDOUT.

    Each STDIN line is split on tabs; the first three columns are used as
    taxid, organism name and accession. Accessions are kept when they look
    like proteins (default) or when they do not (--genomic). After the
    whole catalog has been read, the taxonomy at TAXDUMP_PATH is loaded and
    every taxid missing from it is replaced by the id of the node found for
    the organism name via getNodeFromHit.

    Lines with fewer than three columns, or whose accession column does not
    start with a recognizable accession, are skipped with a warning.

    Written to run on both Python 2 and Python 3.
    """
    usage = '%prog [OPTIONS] TAXDUMP_PATH'
    description = ('reduce full RefSea catalog (from STDIN) to acc->taxid '
                   'map using TAXDUMP to verify taxids')
    parser = OptionParser(usage, description=description)
    parser.add_option(
        "-g", "--genomic", default=False, action="store_true",
        help="output genoic accessions instead of proteins")
    parser.add_option(
        "-v", "--verbose", action="count", dest="verbose", default=1,
        help="Print log messages. Use twice for debugging")
    parser.add_option(
        "-q", '--quiet', dest='verbose', action="store_const", const=0,
        help="Suppress warnings. Only print fatal messages")
    parser.add_option(
        "-A", "--about", action="store_true", dest="about", default=False,
        help="Print description")

    (options, args) = parser.parse_args()

    if options.about:
        print(description)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist
        sys.exit(0)

    # check args
    if options.verbose == 0:
        loglevel = logging.ERROR
    elif options.verbose == 1:
        loglevel = logging.WARN
    elif options.verbose == 2:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG
    logging.basicConfig(stream=sys.stderr, level=loglevel)
    logging.info("Log level set to %r(%d)" % (loglevel, options.verbose))

    if len(args) != 1:
        parser.error("Please supply TAXDUMP path in command line")

    # parse catalog
    accToOrg = {}
    accRE = re.compile(r'\b([A-Z]{2}_[A-Z]*\d+)(\.\b)?\b')
    protRE = re.compile(r'^([ANXWYZ]P_[A-Z]*\d+)$')
    logging.info("reading catalog from STDIN")
    for line in sys.stdin:
        cells = line.rstrip('\r\n').split('\t')
        if len(cells) < 3:
            logging.warning("Skipping malformed catalog line: %r" % line)
            continue
        (taxid, name, acc) = cells[0:3]
        try:
            taxid = int(taxid)
        except ValueError:
            # keep the raw string; it will not be found in the taxonomy
            # and is then resolved through the organism name below
            pass

        logging.debug(acc)
        accMatch = accRE.match(acc)
        if accMatch is None:
            logging.warning("Skipping unrecognized accession: %r" % acc)
            continue
        acc = accMatch.group(1)
        logging.debug("'%s'" % acc)

        m = protRE.match(acc)
        if (m is None) == options.genomic:
            # will match if acc matches exp and genomic is false OR
            # if acc doesn't match and genomic is true
            accToOrg[acc] = (taxid, name)
            logging.debug("USing acc: %s" % (acc))
        else:
            logging.debug("Skipping acc: %s" % (acc))

    # load taxonomy
    logging.info("loading taxonomy from %s" % (args[0]))
    taxonomy = readTaxonomy(args[0], namesMap=True)

    # print table
    changes = 0
    for (acc, (taxid, name)) in accToOrg.items():
        if taxid not in taxonomy.idMap:
            node = getNodeFromHit(name, taxonomy.nameMap)
            logging.debug("Changing %s to %s" % (taxid, node.id))
            taxid = node.id
            changes += 1
        print('\t'.join([acc, str(taxid)]))

    logging.info("Changed %d taxon ids" % changes)
def main():
    """
    Command line entry point: reduce a catalog read from STDIN to an
    accession -> taxid table printed on STDOUT.

    Each STDIN line is split on tabs; the first three columns are used as
    taxid, organism name and accession. Accessions are kept when they look
    like proteins (default) or when they do not (--genomic). After the
    whole catalog has been read, the taxonomy at TAXDUMP_PATH is loaded and
    every taxid missing from it is replaced by the id of the node found for
    the organism name via getNodeFromHit.

    Lines with fewer than three columns, or whose accession column does not
    start with a recognizable accession, are skipped with a warning.
    """
    usage = '%prog [OPTIONS] TAXDUMP_PATH'
    description = ('reduce full RefSea catalog (from STDIN) to acc->taxid '
                   'map using TAXDUMP to verify taxids')
    parser = OptionParser(usage, description=description)
    parser.add_option("-g", "--genomic",
                      default=False,
                      action="store_true",
                      help="output genoic accessions instead of proteins")
    parser.add_option("-v", "--verbose",
                      action="count",
                      dest="verbose",
                      default=1,
                      help="Print log messages. Use twice for debugging")
    parser.add_option("-q", '--quiet',
                      dest='verbose',
                      action="store_const",
                      const=0,
                      help="Suppress warnings. Only print fatal messages")
    parser.add_option("-A", "--about",
                      action="store_true",
                      dest="about",
                      default=False,
                      help="Print description")

    (options, args) = parser.parse_args()

    if options.about:
        print(description)
        # sys.exit rather than the site-provided exit(), which is not
        # guaranteed to exist
        sys.exit(0)

    # check args
    if options.verbose == 0:
        loglevel = logging.ERROR
    elif options.verbose == 1:
        loglevel = logging.WARN
    elif options.verbose == 2:
        loglevel = logging.INFO
    else:
        loglevel = logging.DEBUG
    logging.basicConfig(stream=sys.stderr, level=loglevel)
    logging.info("Log level set to %r(%d)" % (loglevel, options.verbose))

    if len(args) != 1:
        parser.error("Please supply TAXDUMP path in command line")

    # parse catalog
    accToOrg = {}
    accRE = re.compile(r'\b([A-Z]{2}_[A-Z]*\d+)(\.\b)?\b')
    protRE = re.compile(r'^([ANXWYZ]P_[A-Z]*\d+)$')
    logging.info("reading catalog from STDIN")
    for line in sys.stdin:
        cells = line.rstrip('\r\n').split('\t')
        if len(cells) < 3:
            logging.warning("Skipping malformed catalog line: %r" % line)
            continue
        (taxid, name, acc) = cells[0:3]
        try:
            taxid = int(taxid)
        except ValueError:
            # keep the raw string; it will not be found in the taxonomy
            # and is then resolved through the organism name below
            pass

        logging.debug(acc)
        accMatch = accRE.match(acc)
        if accMatch is None:
            logging.warning("Skipping unrecognized accession: %r" % acc)
            continue
        acc = accMatch.group(1)
        logging.debug("'%s'" % acc)

        m = protRE.match(acc)
        if (m is None) == options.genomic:
            # will match if acc matches exp and genomic is false OR
            # if acc doesn't match and genomic is true
            accToOrg[acc] = (taxid, name)
            logging.debug("USing acc: %s" % (acc))
        else:
            logging.debug("Skipping acc: %s" % (acc))

    # load taxonomy
    logging.info("loading taxonomy from %s" % (args[0]))
    taxonomy = readTaxonomy(args[0], namesMap=True)

    # print table
    changes = 0
    # .items() instead of the Python-2-only .iteritems()
    for (acc, (taxid, name)) in accToOrg.items():
        if taxid not in taxonomy.idMap:
            node = getNodeFromHit(name, taxonomy.nameMap)
            logging.debug("Changing %s to %s" % (taxid, node.id))
            taxid = node.id
            changes += 1
        print('\t'.join([acc, str(taxid)]))

    logging.info("Changed %d taxon ids" % changes)