def main():
    """Command-line entry point: build a taxon tree from count tables.

    Each input file is read as lines of ``taxid,count``.  Taxids are
    translated through the NCBI taxonomy loaded from ``--ncbiTaxDir``,
    counts are summed per taxon, and the result is converted to a JSON-style
    tree.  The tree is optionally recolored (``--colors``), re-rooted
    (``--root``) and trimmed (``--cutoff``), then either dumped as JSON
    (``--JSON``) or drawn with ``plotSunburstJSON`` and saved in the
    requested image format.
    """
    usage = "usage: %prog OPTIONS JSON_FILE(s)"
    description = __doc__
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addUniversalOptions(parser)
    parser.add_option(
        "-r", "--root", default=None,
        help="Plot a subset of the tree by choosing a root node for the "
             "subtree",
    )
    parser.add_option(
        "-c", "--colors", default=None,
        help="Set colors by mapping taxon names to color strings. Value "
             "should be a comma-separated list of id=color pairs "
             "(Bacteria=g,Archaea=r). The subtree of each mapped node will "
             "get the given color unless overridden by another entry.",
    )
    parser.add_option(
        "-C", "--cutoff", default=0.025, type="float",
        help="Trim nodes below this value. Interpreted as an absolute "
             "threshold if >1 and as fractional if <1. Set to 0 (or less) "
             "to turn off.",
    )
    parser.add_option(
        "-R", "--ranks", default="superkingdom,phylum,family",
        help="Ranks to inclued in sunburst, default: %default",
    )
    parser.add_option(
        "-n", "--ncbiTaxDir", dest="taxdir", metavar="PATH", default=None,
        help="Directory with unpacked ncbi tax dump (specifically names.dmp "
             "and nodes.dmp) and use to translate taxids into taxa. ",
    )
    parser.add_option(
        "-i", "--icicle", default=False, action="store_true",
        help="Print stacked bars in rectangular coordinates, not polar.",
    )
    parser.add_option(
        "-e", "--exterior_labels", default=False, action="store_true",
        help="Print labels for outermost nodes outside image",
    )
    parser.add_option(
        "-s", "--sortKey", default=[], action="append",
        choices=[NAME, COLOR, VALUE],
        help='how to sort nodes. Defaults to "color" and "name"',
    )
    parser.add_option(
        "-S", "--figsize", default=None,
        help="Comma separated pair of numbers (in inches) for figure size",
    )
    parser.add_option(
        "-f", "--format", dest="format", default="pdf",
        choices=["png", "ps", "pdf", "svg"],
        help="Format for output image", metavar="FORMAT",
    )
    parser.add_option(
        "-J", "--JSON", default=False, action="store_true",
        help="output JSON tree of counts instead of figure",
    )

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    if not options.JSON:
        # setup matplotlib: the backend must be chosen before pyplot is
        # imported, which is why the import lives here and not at file level
        backend = options.format
        if backend == "png":
            backend = "agg"
        matplotlib.use(backend)
        import matplotlib.pyplot as plt

    # load taxonomy
    if options.taxdir is None:
        parser.error("You must supply the location of the NCBI tax dump files")
    taxonomy = readTaxonomy(options.taxdir)

    # build rank list
    ranks = options.ranks.split(",")

    if options.JSON:
        # Standard iterator that returns handles
        inputIterator = inputIteratorNormal
        if len(options.sortKey) > 0:
            logger.warning("the SORT option has no effect on JSON output")
    else:
        # version that defaults to adding format as suffix and returns name
        inputIterator = inputIteratorFig

    # process user selected options
    kwargs = processOptions(options)

    # process input files
    for (inhandle, outfile) in inputIterator(args, options):
        # load counts, summing repeated taxa (unknown taxids collapse to None)
        counts = {}
        for line in inhandle:
            (taxid, count) = line.rstrip("\n\r").split(",")
            if taxid == "None":
                tax = None
            else:
                tax = taxonomy.idMap.get(int(taxid), None)
            counts[tax] = counts.get(tax, 0) + int(count)

        # convert to JSON
        (nxtree, root) = convertToNx(counts, leaves=True, ranks=ranks)
        tree = convertToJSON(nxtree, root)

        # process JSON
        if options.colors is not None:
            setColors(tree, options.colors, **kwargs)
        if options.root is not None:
            newRoot = findNode(tree, options.root, **kwargs)
            if newRoot is not None:
                tree = newRoot

        applyCutoff(tree, options.cutoff, **kwargs)

        if options.JSON:
            putNodeCountsInOther(tree)
            outfile.write(json.dumps(tree, indent=2))
        else:
            # some of the matplotlib functions don't like extra arguments.
            # Drop the ID key from a per-file copy: kwargs is shared by every
            # iteration, so popping from it directly would break the second
            # input file (missing key for setColors/findNode, then KeyError).
            plotKwargs = dict(kwargs)
            plotKwargs.pop(ID)

            # create figure
            plotSunburstJSON(tree, **plotKwargs)

            # save to file
            plt.savefig(outfile, format=options.format)
def main():
    """Command-line entry point: keep reads whose best hits are in a taxon.

    For every read in each hit table, all hits (after the FilterParams
    filter) must map to taxids inside the outer group (``-G``) when one is
    given; the top-scoring hits (within ``--topHitPct`` of the best score)
    are then tested against the target group (``-g``).  A read is kept when
    all of its top hits (or, with ``--any``, at least one) are in the target
    group.  Output is either the matching hit lines or, with ``--reads``,
    just the read names.
    """
    usage = "usage: %prog -O ORTHOLOGY [OPTIONS] BLAST_M8_FILES"
    description = """
Given two lists of taxids and one or more hit tables, identify reads that:
 (1) have their best hits in taxid list 1
 (2) have all other hits in either list

Finally, print out either the hits (that match the target group) for these
reads or just read names (-r). The -F filter limits which hits are used in
part (2) as well as which are printed.

The countMethod (-C) option is not used.
"""
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser, defaults={'mapFile': None,
                                      'parseStyle': ACCS,
                                      'filterPct': -1,
                                      'countMethod': 'all',
                                      'taxdir': None})
    parser.add_option(
        "-g", "--targetTaxonGroup", dest="group1", default=None,
        metavar="TAXON", action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, a name, "
             "or a file listing taxids. Use multiple times to specify a list "
             "of organisms. Use -a to specify whether all or at least one of "
             "the top hits must match.")
    parser.add_option(
        "-a", "--any", default=False, action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
             "in the target taxon/taxa. By default, all top hits must be in "
             "the target group.")
    addUniversalOptions(parser)
    parser.add_option(
        '-t', '--topHitPct', default=0, type='float',
        help='How close (as a %) to the best score a hit must be to qualify '
             'as a top hit. Default is 0, ie must have the best score. Use '
             '100 to get all hits.')
    parser.add_option(
        "-G", "--outerTaxonGroup", dest="group2", default=None,
        metavar="TAXON", action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit these "
             "hits) must be in the target group or this group. Again, it can "
             "be a taxid, a name, or a file listing taxids. It can also be "
             "inkoved multiple times to choose multiple groups.")
    parser.add_option(
        '-r', '--reads', default=False, action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    (options, args) = parser.parse_args()

    if options.about:
        # single-argument call form behaves the same on Python 2 and 3
        print(description)
        exit(0)

    # check args
    setupLogging(options, description)
    if options.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if options.taxdir is not None:
        taxonomy = readTaxonomy(options.taxdir, namesMap=True)
    else:
        taxonomy = None

    group1Map = getGroupMap(options.group1, taxonomy)
    group2Map = getGroupMap(options.group2, taxonomy)
    # NOTE(review): 439482 is a hard-coded taxid probe, presumably left over
    # from debugging; it only affects debug-level log output.
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s"
                  % (len(group1Map), group1Map.get(439482, False)))
    if group2Map is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s"
                      % (len(group2Map), group2Map.get(439482, False)))

    # map reads to hits
    if options.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(options.mapFile,
                               valueType=int,
                               keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(options.parseStyle, None)
    if options.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif options.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif options.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.createFromOptions(options)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(args, options):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle,
                                           filterParams,
                                           returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group2Map is not None \
                            and not group2Map.get(taxid, False):
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group: abandon this read
                    # (breaking skips the for-loop's else clause below)
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)",
                              read, bestScore)
                allInGroup = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids,
                                                     bestScore,
                                                     options.topHitPct):
                    if _anyTaxidInGroup(taxids, group1Map):
                        logging.debug("%s (%r) is in group 1", hit, taxids)
                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1",
                                      hit, taxids)
                        allInGroup = False

                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1", read)
                    continue
                if (not options.any) and (not allInGroup):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1",
                                  read)
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if options.reads:
                    logging.debug("Keeping %s", read)
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug("Keeping %d hits for %s",
                                  len(recognized), read)
                    for hit in sorted(recognized,
                                      key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if options.reads:
            logging.info("Printed %d of %d reads"
                         % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads"
                         % (printCount, goodReadCount, readCount))
def main():
    """Command-line entry point: assign reads to pathways or gene families.

    Parses each m8 hit table with ``parseM8File`` (optionally translating hit
    names through a map file), applies ``applySimpleCutoff`` with the optional
    KEGG translation for the requested ``--level``, and writes a two-column
    ``Read``/``Hit`` table in one of three styles: ``cols`` (extra
    assignments in extra columns), ``lines`` (one line per assignment) or
    ``python`` (``repr`` of the assignment).
    """
    usage = "usage: %prog [OPTIONS] INPUT_FILE(S)"
    description = """
Takes an m8 blast and assigns each read to a pathway or gene family. Blast may
be specified with -i or piped to STDIN.
"""
    parser = OptionParser(usage, description=description)
    parser.add_option(
        "-i", "--inputfile", dest="infile", metavar="INFILE",
        help="Read data table from INFILE")
    addIOOptions(parser)
    parser.add_option(
        '-O', "--outputStyle", default="cols",
        choices=['cols', 'lines', 'python'],
        help="How are multiple assignments displayed in output. By default "
             "('cols'), multiple hits show up in multiple columns. The "
             "'lines' option prints out a new line for each assignment. The "
             "'python' option prints each assignment as a python string (in "
             "quotes) or a list of strings (in quotes, separted by commas, "
             "surrounded bya pair of sqaure brackets).")
    parser.add_option(
        "-m", "--mapFile", dest="mapFile", metavar="MAPFILE",
        help="Location of file containing table of with db hit name as first "
             "column and geneIDs (Knumber) in second column.")
    parser.add_option(
        "-M", "--mapStyle", default='auto', choices=['auto', 'kegg', 'tab'],
        help="What type of mapping file are you using: simple tab separated "
             "list of IDs and kos, or the genes_ko.list file from KEGG "
             "(which adds ko: to the K numbers and can have multiple records "
             "for each gene id). By default, this script will inspect the "
             "file name and guess, but you can force either 'kegg' or 'tab' "
             "with this option.")
    parser.add_option(
        "-p", "--parseStyle", default=KEGG,
        choices=[ACCS, GIS, KEGG, HITID, HITDESC],
        help="What should be parsed from the hit table: accessions('accs'), "
             "'gis', K numbers in description ('kegg'), the full hit "
             "name('hitid'), or the full hit description('hitdesc'). "
             "(defaults to '%default')")
    parser.add_option(
        "-c", "--cutoff", dest="cutoff", type="float", default=0.01,
        help="Cutoff for showing paths or genes. If a fractional count for a "
             "path/gene is below this value, it will be labelled None.",
        metavar="CUTOFF")

    # format and filterPct
    addHitTableOptions(parser)

    parser.add_option(
        "-C", "--countMethod", dest="countMethod", default="all",
        choices=('first', 'most', 'all', 'consensus'),
        help="How to deal with assignments from multiple hits. (first, most: "
             "can return multiple hits, all (default): return every hit, "
             "consensus: return None unless all the same)",
        metavar="COUNTMETHOD")
    parser.add_option(
        "-r", "--filterForKO", action="store_true", dest="koHitsOnly",
        default=False,
        help="ignore hits with no KO assignment. This means reads with no "
             "hits to KO tagged sequences will not be in the output.")
    parser.add_option(
        "-l", "--level", dest="level", default="ko",
        choices=('ko', 'NAME', 'DEFINITION', 'EC', 'PATHWAY', '1', '2', '3'),
        help="Either 'ko'; a string to look for in ko file "
             "('PATHWAY','NAME', 'DEFINITION', or 'EC'); or level in kegg "
             "class heirarchy (1, 2, or 3 (should be same as PATHWAY))")
    parser.add_option(
        "-k", "--koFile", dest="ko", metavar="KOFILE", default=None,
        help="File containing kegg heirarchy (either ko or ko00001.keg)")
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # NOTE(review): infile is opened here but never read below; all input
    # comes from inputIterator(args, options).  Kept so a bad -i path still
    # fails early as before.
    if options.infile is None:
        infile = sys.stdin
    else:
        infile = open(options.infile)

    if options.parseStyle == KEGG:
        if options.mapFile is not None:
            logging.warning("Do you REALLY want to apply a mapping to KOs?")

    if options.level != 'ko':
        if options.ko is None:
            # optparse Values objects have no error(); the parser reports it
            parser.error("Please supply KEGG file if sepcifying a level "
                         "other than 'ko' ")

        # read KEGG file
        koTranslation = readKEGGFile(options.ko, options.level)
    else:
        koTranslation = None

    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle == 'kegg' or (
                options.mapStyle == 'auto' and
                len(options.mapFile) >= 13 and
                options.mapFile[-13:] == 'genes_ko.list'):
            valueMap = parseLinkFile(options.mapFile)
        else:
            if options.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(options.mapFile,
                                    valueType=None,
                                    keyType=keyType)
    else:
        valueMap = None

    for (inhandle, outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s"
                      % (inhandle, outhandle))
        hitMap = parseM8File(inhandle,
                             valueMap,
                             options.hitTableFormat,
                             options.filterTopPct,
                             options.parseStyle,
                             options.countMethod,
                             ignoreEmptyHits=options.koHitsOnly,
                             sortReads=options.hitTableSortReads)

        # manipulate mappings
        hitMap = applySimpleCutoff(hitMap, options.cutoff, koTranslation)

        log("maps complete for %d reads" % (len(hitMap)))

        # print out hit table
        outhandle.write("Read\tHit\n")
        # The three styles are mutually exclusive.  The original used two
        # independent `if` statements, so 'python' output was followed by a
        # second, 'cols'-style copy of every read.
        if options.outputStyle == 'python':
            for read in sorted(hitMap.keys()):
                hit = hitMap[read]
                outhandle.write(str(read))
                outhandle.write("\t")
                outhandle.write(repr(hit))
                outhandle.write("\n")
        elif options.outputStyle == 'lines':
            for read in sorted(hitMap.keys()):
                hit = hitMap[read]
                if isinstance(hit, list):
                    for h in sorted(hit):
                        outhandle.write(str(read))
                        outhandle.write("\t")
                        outhandle.write(str(h))
                        outhandle.write("\n")
                else:
                    outhandle.write(str(read))
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                    outhandle.write("\n")
        else:
            for read in sorted(hitMap.keys()):
                hit = hitMap[read]
                outhandle.write(str(read))
                if isinstance(hit, list):
                    for h in sorted(hit):
                        outhandle.write("\t")
                        outhandle.write(str(h))
                else:
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                outhandle.write("\n")
def main():
    """Command-line entry point: assign reads to genes/pathways by level.

    Resolves the list of levels to report (defaulting per hierarchy type),
    loads an optional hit-name map (auto-detecting its style from the first
    non-comment line), builds one mapper per level with ``getLevelMapper``,
    and then, for every hit of every read in each m8 table, writes one
    tab-separated row per assignment produced by ``handleMultipleMappings``.
    """
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes a single m8 blast file and generates a table (or tables) of
pathway/gene family assignments for the query sequences (aka 'reads').
Assignments can be for gene families, gene classes, or pathways. Multiple
pathway or classification levels can be given. If they are, an assignment
will be made at each level.
    This differs from assignPathsToReadsFromBlast.py in that: (1) it can
handle CAZy and SEED, (2) it will output multiple levels in one file, (3)
multiple assignments are always printed on multiple lines.
    This script will work with KEGG, SEED, or CAZy. CAZy only has one level
of heirarchy, the others have 3. The CAZy heirarchy is apparent from the hit
name and needs no supporting files. KEGG and SEED require mapping files to
identify gene families and heirachy files to report levels other than the
gene family or ortholog level. Both SEED and KEGG have three levels of
classifications that can be indicated with a 1, 2, or 3. The words
"subsystem" and "pathway" are synonyms for level 3.
    If a count method is selected that can produce multiple assignments per
read, each assignment will be printed on a new line.
    NOTE: in KEGG (and SEED) a single ortholog (role) may belong to multiple
pathways (subsystems). A hit to such an ortholog will result in extra
assignment values for that query sequence (1 for each pathway it belongs to).
"""
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    parser.add_option(
        "-l", "--level", dest="levels", default=None, metavar="LEVEL",
        action="append",
        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family', 'ko', or 'ortholog' (which are all
                      synonyms), or anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and levels
                      1, 2, and 3 for KEGG and SEED and gene and group for
                      CAZy and COG.""")
    parser.add_option(
        '-s', '--squash', dest='splitForLevels', default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.addPathOptions(parser)

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # Set defaults and check for some conflicts
    if options.levels is None and options.heirarchyFile is None:
        # using hit names only
        # NOTE(review): with levels == [None] the header join further down
        # would raise TypeError unless this case is handled elsewhere --
        # confirm the intended behavior of this mode.
        options.levels = [None]
    else:
        if options.heirarchyFile is None \
                and options.heirarchyType != 'cazy':
            logging.warning("Type: %s" % (options.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if options.levels is None:
            # set a default.  Compare by value (not identity) and chain with
            # elif: the original `is`/`if ... if ... else` form let the final
            # else overwrite the KEGG defaults with the CAZy ones.
            if options.heirarchyType == 'kegg':
                options.levels = ['ko', '1', '2', 'pathway']
            elif options.heirarchyType == 'seed':
                options.levels = ['role', '1', '2', 'subsystem']
            else:
                options.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            options.levels = cleanLevels(options.levels)
        except Exception as e:
            parser.error(str(e))

    # only print to stdout if there is a single input file
    if len(args) > 1 and options.outfile is None:
        parser.error("STDOUT only works if a single input file is given!")

    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle == 'auto':
            # sniff the map style from the first non-comment line
            with open(options.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                options.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                options.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                options.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    options.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s"
                    % (firstLine))

            logging.info("Map file seems to be: %s" % (options.mapStyle))

        if options.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(options.mapFile)
        elif options.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(options.mapFile)
        # elif options.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(options.mapFile)
        else:
            if options.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(options.mapFile,
                                    valueType=None,
                                    keyType=keyType)

        if len(valueMap) > 0:
            # show one (key, value) pair as an example; next(iter(...)) works
            # on both Python 2 and 3 without copying the whole map
            exampleKey = next(iter(valueMap))
            logging.info("Read %d items into map. EG: %s"
                         % (len(valueMap),
                            (exampleKey, valueMap[exampleKey])))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(l, options) for l in options.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s"
                      % (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            options.hitTableFormat,
            options.filterTopPct,
            options.parseStyle,
            options.countMethod,
            ignoreEmptyHits=options.mappedHitsOnly,
            sortReads=options.hitTableSortReads)

        outhandle.write("Read\t%s\n" % ('\t'.join(options.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                # one value per requested level, in the order of the header
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(assignment,
                                                             options):
                    outhandle.write("%s\t%s\n"
                                    % (read, "\t".join(assignmentList)))
def main():
    """Command-line entry point: draw a sunburst plot per input JSON tree.

    Each input is loaded with ``json.load``, optionally recolored
    (``--colors``) and re-rooted (``--root``), drawn with
    ``plotSunburstJSON`` and saved in the image format chosen with
    ``--format``.
    """
    usage = "usage: %prog OPTIONS JSON_FILE(s)"
    description = """
Generates a sunburst plot for each input JSON tree.
"""
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addUniversalOptions(parser)
    parser.add_option(
        '-r', "--root", default=None,
        help="Plot a subset of the tree by choosing a root node for the "
             "subtree")
    parser.add_option(
        '-c', "--colors", default=None,
        help="Set colors by mapping node IDs to color strings. Value should "
             "be a comma-separated list of id=color pairs "
             "(Bacteria=g,Archaea=r). The subtree of each mapped node will "
             "get the given color unless overridden by another entry. If "
             "omitted, colors pulled from JSON (using colorkey) with red as "
             "the default. If present without --colorkey setting, colors in "
             "JSON will be ignored.")
    parser.add_option(
        '-s', '--sort', default=None,
        help="List of keys to sort on for plotting, NOTE: sorting on the "
             "value key will give suprising results for lower level nodes as "
             "sum of nested values will not be included. To get desired "
             "behavior, add a total value key to your tree and sort on that.")
    parser.add_option(
        '-I', '--idkey', default='name',
        help="String to use as key for node IDs. Default: %default")
    parser.add_option(
        '-L', '--labelkey', default='name',
        help="String to use as key for node labels. Default: %default")
    parser.add_option(
        '-C', '--colorkey', default='color',
        help="String to use as key for node colors. Default: %default")
    parser.add_option(
        '-V', '--valuekey', default='size',
        help="String to use as key for node sizes. Default: %default")
    parser.add_option(
        '-K', '--kidskey', default='children',
        help="String to use as key for list of child nodes. "
             "Default: %default")
    parser.add_option(
        '-i', '--icicle', default=False, action='store_true',
        help="Print stacked bars in rectangular coordinates, not polar.")
    parser.add_option(
        '-e', '--exterior_labels', default=False, action='store_true',
        help="Print labels for outermost nodes outside image")
    parser.add_option(
        '-S', '--figsize', default=None,
        help="Comma separated pair of numbers (in inches) for figure size")
    parser.add_option(
        "-f", "--format", dest="format", default='pdf',
        choices=['png', 'ps', 'pdf', 'svg'],
        help="Format for output image", metavar="FORMAT")

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    # setup matplotlib: the backend must be chosen before pyplot is imported,
    # which is why the import lives here and not at file level
    backend = options.format
    if backend == 'png':
        backend = 'agg'
    matplotlib.use(backend)
    import matplotlib.pyplot as plt

    for (inhandle, outfile) in inputIterator(args, options):
        # import JSON
        tree = json.load(inhandle)

        # process user selected options (rebuilt per file because the ID key
        # is popped from it below)
        kwargs = processOptions(options)

        # process JSON
        if options.colors is not None:
            setColors(tree, options.colors, **kwargs)
        if options.root is not None:
            newRoot = findNode(tree, options.root, **kwargs)
            if newRoot is not None:
                tree = newRoot

        # some of the matplotlib functions don't like extra arguments
        kwargs.pop(ID)

        # create figure
        plotSunburstJSON(tree, **kwargs)

        # save to file.  Pass the format explicitly: without it matplotlib
        # infers the type from the file name and falls back to its own
        # default when there is no extension (or outfile is a handle), which
        # would ignore the user's --format choice.
        plt.savefig(outfile, format=options.format)