def main():
    """
    Set up the CLI, count hits per taxon and print the count tables.

    Steps, in order:
      * build the argument parser and parse the command line
      * validate the rank / taxonomy / count-method combinations
        (conflicts are reported through ``parser.error``)
      * load sequence weights and the maps returned by ``readMaps``
      * count hits per input file, either by pooling all files
        ('tophit'/'toporg') or by parsing each file on its own
      * hand the counts to ``printCountTablesByRank``
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="outfile",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-r", "--rank", dest="ranks", default=None,
                        metavar="RANK", action="append",
                        help=""" Rank(s) to collect counts on. Use flag
                        multiple times to specify multiple ranks. If
                        multiple values given, one table produced for each
                        with rank name appended to file name. Defaults to
                        all major ranks between phylum and species.
                        Corresponds to rank names in nodes.dmp. To see list
                        run: 'cut -f5 nodes.dmp | uniq | sort | uniq' in
                        ncbi tax dir. Will also accept 'organism' to mean
                        no rank (ie, just the organism name).""")
    parser.add_argument(
        "-s", "--collapseToDomain",
        default=False,
        action="store_true",
        help="Collapse all taxa below given rank down to "
             "superkingdom/domain. EG: in the genus output, anything "
             "assigned to Cyanobactia, will be lumped in with all "
             "other bacteria")
    parser.add_argument(
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="""When using tophit or toporg, redistribute proportionally
        instead of winner take all""")
    parser.add_argument(
        "-R", "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, tax dir, and more
    add_taxon_arguments(
        parser,
        choices={
            'countMethod': (
                'LCA',
                'all',
                'first',
                'most',
                'tophit',
                'toporg',
                'consensus')})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.proportional and \
            arguments.countMethod not in ['tophit', 'toporg']:
        # the message now names the flag as it is actually spelled
        parser.error("--proportional only has meaning "
                     "if using tophit or toporg")

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    arguments.ranks = checkNoneOption(arguments.ranks)
    arguments.printRanks = checkNoneOption(arguments.printRanks)

    logging.info("Printing out ranks: %r", arguments.ranks)

    # Set defaults and check for some conflicts
    if arguments.ranks is None and arguments.taxdir is None:
        # using hit names only
        arguments.ranks = [ORG_RANK]
        if arguments.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if arguments.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if arguments.ranks is None:
            # set a default
            arguments.ranks = [
                'phylum',
                'class',
                'order',
                'family',
                'genus',
                'species']

        try:
            # Make sure the rank lists make sense
            arguments.ranks = cleanRanks(arguments.ranks)
            if arguments.printRanks is not None:
                arguments.printRanks = cleanRanks(arguments.printRanks)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single rank
    if len(arguments.ranks) > 1 and arguments.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(arguments.ranks) > 1:
        rank = None
        if arguments.countMethod in ['consensus', 'most']:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "use a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'")
    else:
        rank = arguments.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(arguments)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod in ('tophit', 'toporg'):
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.keys())

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=not arguments.proportional,
                parseStyle=arguments.parseStyle,
                sequenceWeights=sequenceWeights)
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                             taxonomy=taxonomy,
                                             hitStringMap=hitStringMap)

            def translateHit(hit):
                """Return the first translation of a hit."""
                return hitTranslator.translateHit(hit=hit)[0]
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=not arguments.proportional,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=ACCS)

            # Organisms will be returned, make translator trivial:
            translateHit = passThrough

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file name>/<read name>"
            file_name, read_name = read_name.split("/", 1)
            file_tag = fileLabels[unquote_plus(file_name)]
            taxon = translateHit(hit)
            taxcount = fileCounts[file_tag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][taxon] = taxcount + increment
            totals[file_tag] += increment
        logging.debug("%s", totals)

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already gives universal newlines on Python 3;
            # the old 'rU' mode raises ValueError from Python 3.11 on.
            # The with-block also closes the file if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(infile,
                                          hitStringMap,
                                          params,
                                          arguments.parseStyle,
                                          arguments.countMethod,
                                          taxonomy=taxonomy,
                                          rank=rank)

                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

    printCountTablesByRank(fileCounts, totals, sortedLabels, arguments)
def main():
    """
    Set up the CLI, count hits per gene/pathway level and print the tables.

    Steps, in order:
      * build the argument parser and parse the command line
      * pick default levels from the hierarchy type when none were given
      * load sequence weights and, if a map file was given, the
        hit -> value map (auto-detecting its style from the first
        non-comment line when the style is 'auto')
      * count hits per input file, either by pooling all files ('tophit')
        or by parsing each file on its own
      * hand the counts to ``printCountTablesByLevel``
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="output_file",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                        multiple times to specify multiple levels. If
                        multiple values given, one table produced for each
                        with rank name appended to file name. Levels can
                        be an integer (1-3) for KEGG or SEED levels, any
                        one of 'gene', 'role', 'family', 'ko', or
                        'ortholog' (which are all synonyms), or anything
                        not synonymous with 'gene' to get CAZy groups.
                        Defaults to ortholog/role and levels 1, 2, and 3
                        for KEGG and SEED and gene and group for CAZy
                        and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={'countMethod':
                 ('tophit', 'first', 'most', 'all', 'consensus')},
        helps={'countMethod':
               ("How to deal with counts from multiple hits. ('first': "
                "just use the first hit, 'most': "
                "can return multiple hits, 'all': return every hit, "
                "consensus: return None unless all the same). Do not "
                "use most or consensus with more than one level at a time. "
                "Default is 'tophit': This breaks any ties by choosing "
                "the most abundant hit based on other unambiguous "
                "assignments.")})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            # Compare by value (== not `is`) and chain with elif: the old
            # if/if/else always replaced the KEGG defaults with the
            # generic ['gene', 'group'] fallback.
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the style from the first non-empty, non-comment line
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

            logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                valueDelim=arguments.tab_map_delim,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s",
                         len(valueMap), next(iter(valueMap.items())))
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file tag>/<read name>"
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug(
                "READ: %s\t%s\t%s\t%s",
                file_tag, read_name, hit.hit, gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug("%s", totals)

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already gives universal newlines on Python 3;
            # the old 'rU' mode raises ValueError from Python 3.11 on.
            # The with-block also closes the file if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(
                    infile,
                    valueMap,
                    params,
                    arguments.parseStyle,
                    arguments.countMethod,
                    ignoreEmptyHits=arguments.mappedHitsOnly)

                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
def main():
    """
    Set up the CLI and write the single best hit for every read.

    With one input file, or with --individualFiles, each input is
    processed on its own through ``inputIterator``. Otherwise all inputs
    are pooled through ``redistribute.multipleFileWrapper`` so that
    abundance is judged across files, and each chosen hit is written to
    the output that belongs to the file it came from.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={"filterPct": 0, "parseStyle": ACCS, "countMethod": "tophit"},
        choices={"countMethod": ("tophit", "toporg")},
    )
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.",
    )
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.",
    )

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    params = FilterParams.create_from_arguments(arguments)

    # Keyword arguments shared by every pickBestHitByAbundance call below.
    pick_kwargs = {
        "filterParams": params,
        "winnerTakeAll": not arguments.proportional,
        "parseStyle": arguments.parseStyle,
    }
    if arguments.countMethod == "toporg":
        # translate to organism before finding most abundant; for 'tophit'
        # no taxonomy is given and hits are just mapped to accessions
        (taxonomy, hitStringMap) = readMaps(arguments)
        pick_kwargs["taxonomy"] = taxonomy
        pick_kwargs["hitStringMap"] = hitStringMap

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug("Reading from %s and writing to %s",
                          inhandle, outhandle)

            m8stream = M8Stream(inhandle)
            readHits = redistribute.pickBestHitByAbundance(
                m8stream, returnLines=True, **pick_kwargs
            )

            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        try:
            for infile_handle in arguments.input_files:
                infile_name = infile_handle.name
                if arguments.output_file is None:
                    outputMap[infile_name] = sys.stdout
                    continue
                # More than one input is guaranteed in this branch, so the
                # output file name is always used as a suffix.
                if arguments.cwd:
                    # strip path info first
                    outfile = (
                        "./"
                        + os.path.split(infile_name)[1]
                        + arguments.output_file
                    )
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, "w")

            readHits = redistribute.pickBestHitByAbundance(
                multifile, returnLines=False, **pick_kwargs
            )

            for (read, hit) in readHits:
                # reads and hit lines are prefixed "<quoted file name>/"
                infile_name, read = read.split("/", 1)
                outhandle = outputMap[unquote_plus(infile_name)]
                outhandle.write(hit.line.split("/", 1)[1])
        finally:
            # Close the files opened here even when processing fails;
            # stdout (used when no output file is given) is left open.
            if arguments.output_file is not None:
                for outhandle in outputMap.values():
                    outhandle.close()
def main():
    """
    Set up the CLI and write the single best hit for every read.

    A single input file, or any number of files together with
    --individualFiles, is handled one file at a time via
    ``inputIterator``. Otherwise every input is wrapped into one stream
    with ``redistribute.multipleFileWrapper`` and the selected hits are
    routed back to one output per input file.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(parser,
                        defaults={
                            'filter_top_pct': 0,
                            'parseStyle': ACCS,
                            'countMethod': 'tophit'
                        },
                        choices={'countMethod': ('tophit', 'toporg')})
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.")
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    wta = not arguments.proportional
    use_tophit = arguments.countMethod == 'tophit'

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug("Reading from %s and writing to %s",
                          inhandle, outhandle)

            m8stream = M8Stream(inhandle)
            if use_tophit:
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            for line in readHits:
                outhandle.write(line)

        return

    # process all files at once
    multifile = redistribute.multipleFileWrapper(arguments.input_files)

    # Build a map from input file name to output handle
    outputMap = {}
    try:
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            else:
                # There is always more than one input here, so the output
                # file name is used as a suffix on each input name.
                if arguments.cwd:
                    # strip path info first
                    infileFile = os.path.split(infile_name)[1]
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if use_tophit:
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            # reads and hit lines are prefixed "<quoted file name>/"
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            outhandle.write(hit.line.split("/", 1)[1])
    finally:
        # Close whatever was opened above even if processing raised;
        # stdout (used when no output file is given) is never closed.
        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()
def main():
    """
    Set up the optparse CLI and write the best hit for every read.

    With at most one input file, or with --individualFiles, every input
    is processed on its own through ``inputIterator``. Otherwise all
    inputs are pooled through ``redistribute.multipleFileWrapper`` and
    each chosen hit is written to the output of the file that
    ``readFileDict`` associates with its read.
    """
    usage = "usage: %prog [OPTIONS] HIT_TABLE(S)"
    description = """
Takes an m8 blast and picks the best hit for each. First, only the best scores
are used, but if there is a tie (aka ambiguous hit), than a winner is assigned
so that the proportion reads assigned to each organism matches the proportion
of unambiguos hits. FilterPct defaults to 0, but can be altered, but I don't
recommend it. ParseStyle and countMethod are ignored.
    """

    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser,
                    defaults={'filterPct': 0,
                              'parseStyle': ACCS,
                              'countMethod': 'tophit'},
                    choices={'countMethod': ('tophit', 'toporg')})
    addUniversalOptions(parser)
    parser.add_option("-i", "--individualFiles", dest="individual",
                      default=False, action="store_true",
                      help="Use this flag to process files independently. "
                           "Normally, counts from all files are pooled for "
                           "making choices.")

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # load necessary maps
    params = FilterParams.createFromOptions(options)
    if options.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(options)

    if len(args) <= 1 or options.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(args, options):
            logging.debug("Reading from %s and writing to %s",
                          inhandle, outhandle)

            m8stream = M8Stream(inhandle)
            if options.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=True,
                    parseStyle=options.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=True,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=options.parseStyle)

            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once
        (multifile, readFileDict) = redistribute.multipleFileWrapper(
            args, params, returnLines=True)

        # Build a map from input file name to output handle
        outputMap = {}
        try:
            for infileName in args:
                if options.outfile is None:
                    outputMap[infileName] = sys.stdout
                else:
                    # More than one input is guaranteed in this branch, so
                    # the output file name is always used as a suffix.
                    if options.cwd:
                        # strip path info first
                        infileFile = os.path.split(infileName)[1]
                        outfile = "./" + infileFile + options.outfile
                    else:
                        outfile = infileName + options.outfile
                    outputMap[infileName] = open(outfile, 'w')

            if options.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    multifile,
                    filterParams=params,
                    returnLines=False,
                    winnerTakeAll=True,
                    parseStyle=options.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    multifile,
                    filterParams=params,
                    returnLines=False,
                    winnerTakeAll=True,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=options.parseStyle)

            for (read, hit) in readHits:
                outhandle = outputMap[readFileDict[read]]
                outhandle.write(hit.line)
        finally:
            # Close opened files even on failure. values() exists on both
            # Python 2 and 3, unlike the Python-2-only itervalues().
            if options.outfile is not None:
                for outhandle in outputMap.values():
                    outhandle.close()
def main():
    """
    Set up the CLI, count hits per gene/pathway level and print the tables.

    Steps, in order:
      * build the argument parser and parse the command line
      * pick default levels from the hierarchy type when none were given
      * load sequence weights and, if a map file was given, the
        hit -> value map (auto-detecting its style from the first
        non-comment line when the style is 'auto')
      * count hits per input file, either by pooling all files ('tophit')
        or by parsing each file on its own
      * hand the counts to ``printCountTablesByLevel``
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files",
                        nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o",
                        "--outfile",
                        dest="output_file",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                        multiple times to specify multiple levels. If
                        multiple values given, one table produced for each
                        with rank name appended to file name. Levels can
                        be an integer (1-3) for KEGG or SEED levels, any
                        one of 'gene', 'role', 'family', 'ko', or
                        'ortholog' (which are all synonyms), or anything
                        not synonymous with 'gene' to get CAZy groups.
                        Defaults to ortholog/role and levels 1, 2, and 3
                        for KEGG and SEED and gene and group for CAZy
                        and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={
            'countMethod': ('tophit', 'first', 'most', 'all', 'consensus')
        },
        helps={
            'countMethod':
            ("How to deal with counts from multiple hits. ('first': "
             "just use the first hit, 'most': "
             "can return multiple hits, 'all': return every hit, "
             "consensus: return None unless all the same). Do not "
             "use most or consensus with more than one level at a time. "
             "Default is 'tophit': This breaks any ties by choosing "
             "the most abundant hit based on other unambiguous "
             "assignments.")
        })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            # Compare by value (== not `is`) and chain with elif: the old
            # if/if/else always replaced the KEGG defaults with the
            # generic ['gene', 'group'] fallback.
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the style from the first non-empty, non-comment line
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

            logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s", len(valueMap),
                         next(iter(valueMap.items())))
        else:
            # logging.warn is a deprecated alias of logging.warning
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file tag>/<read name>"
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug("READ: %s\t%s\t%s\t%s", file_tag, read_name,
                          hit.hit, gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug("%s", totals)

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already gives universal newlines on Python 3;
            # the old 'rU' mode raises ValueError from Python 3.11 on.
            # The with-block also closes the file if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(
                    infile,
                    valueMap,
                    arguments.hitTableFormat,
                    arguments.filter_top_pct,
                    arguments.parseStyle,
                    arguments.countMethod,
                    ignoreEmptyHits=arguments.mappedHitsOnly)

                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info("parsed %d hits (%d unique) for %d reads from %s",
                         total, len(counts), len(hitMap), filename)

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
def main():
    """
    Set up the CLI, count hits per taxon in each input file, print tables.

    Steps, in order:
      1. build the argument parser and parse the command line;
      2. validate/default the rank options (``-r`` and ``-R``);
      3. load the sequence weights and the taxonomy / hit-string maps;
      4. tally hits for every input hit table, either jointly across all
         files ('tophit' and 'toporg' count methods) or file by file
         (every other count method);
      5. pass the tallies to printCountTablesByRank.

    Input file arguments may be given as ``TAG=FILENAME``; the tag is then
    used as the label for that file's counts instead of the file name.

    Invalid option combinations are reported with ``parser.error()``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_files", nargs="+", default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="outfile",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-r", "--rank", dest="ranks", default=None,
                        metavar="RANK", action="append",
                        help=""" Rank(s) to collect counts on. Use flag
                        multiple times to specify multiple ranks. If
                        multiple values given, one table produced for each
                        with rank name appended to file name. Defaults to
                        all major ranks between phylum and species.
                        Corresponds to rank names in nodes.dmp. To see list
                        run: 'cut -f5 nodes.dmp | uniq | sort | uniq' in
                        ncbi tax dir. Will also accept 'organism' to mean
                        no rank (ie, just the organism name).""")
    parser.add_argument(
        "-s", "--collapseToDomain", default=False, action="store_true",
        help="Collapse all taxa below given rank down to "
             "superkingdom/domain. EG: in the genus output, anything "
             "assigned to Cyanobactia, will be lumped in with all "
             "other bacteria")
    parser.add_argument(
        "-R", "--printRank", dest="printRanks", action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, tax dir, and more
    add_taxon_arguments(parser, choices={
        'countMethod': ('LCA', 'all', 'first', 'most',
                        'tophit', 'toporg', 'consensus')
    })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if not arguments.input_files:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    arguments.ranks = checkNoneOption(arguments.ranks)
    arguments.printRanks = checkNoneOption(arguments.printRanks)

    logging.info("Printing out ranks: %r", arguments.ranks)

    # Set defaults and check for some conflicts
    if arguments.ranks is None and arguments.taxdir is None:
        # using hit names only
        arguments.ranks = [ORG_RANK]
        if arguments.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if arguments.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if arguments.ranks is None:
            # set a default
            arguments.ranks = [
                'phylum', 'class', 'order', 'family', 'genus', 'species'
            ]

        try:
            # Make sure the rank lists make sense
            arguments.ranks = cleanRanks(arguments.ranks)
            if arguments.printRanks is not None:
                arguments.printRanks = cleanRanks(arguments.printRanks)
        except Exception as e:
            # surface any rank validation problem as a usage error
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single rank
    if len(arguments.ranks) > 1 and arguments.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(arguments.ranks) > 1:
        rank = None
        if arguments.countMethod in ['consensus', 'most']:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "use a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'")
    else:
        rank = arguments.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(arguments)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    if arguments.countMethod in ('tophit', 'toporg'):
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.keys())

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=True,
                parseStyle=arguments.parseStyle,
                sequenceWeights=sequenceWeights)
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                             taxonomy=taxonomy,
                                             hitStringMap=hitStringMap)

            def translateHit(hit):
                """Return the first translation of a hit."""
                return hitTranslator.translateHit(hit=hit)[0]
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=True,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=ACCS)

            # Organisms will be returned, make translator trivial:
            translateHit = passThrough

        # use read->file mapping and hit translator to get file based counts
        # from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # the returned read name is "<quoted file name>/<read name>"
            file_name, read_name = read_name.split("/", 1)
            file_tag = fileLabels[unquote_plus(file_name)]
            taxon = translateHit(hit)
            taxcount = fileCounts[file_tag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                # reads missing from the weights table count once
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][taxon] = taxcount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode: the 'U' flag was removed in Python 3.11 and
            # universal newlines are already the default for text files.
            # The with-block closes the file even if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(infile,
                                          hitStringMap,
                                          arguments.hitTableFormat,
                                          arguments.filter_top_pct,
                                          arguments.parseStyle,
                                          arguments.countMethod,
                                          taxonomy=taxonomy,
                                          rank=rank)

                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)
                fileCounts[filetag] = counts
                totals[filetag] = total

                logging.info(
                    "parsed %d hits (%d unique) for %d reads from %s",
                    total, len(counts), len(hitMap), filename)

    printCountTablesByRank(fileCounts, totals, sortedLabels, arguments)
def main():
    """
    Parse the command line, count hits per taxon per file, print tables.

    Steps, in order:
      1. build the option parser and parse the command line (positional
         arguments are the hit table files);
      2. validate/default the rank options (``-r`` and ``-R``);
      3. load the sequence weights and the taxonomy / hit-string maps;
      4. tally hits for every input hit table, either jointly across all
         files ('tophit' and 'toporg' count methods) or file by file
         (every other count method);
      5. pass the tallies to printCountTablesByRank.

    Input file arguments may be given as ``TAG=FILENAME``; the tag is then
    used as the label for that file's counts instead of the file name.

    Invalid option combinations are reported with ``parser.error()``.
    """
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes m8 blast files and generates a table of taxon hit counts for the
given rank. Columns are input files and rows are taxa. If multiple ranks
given (the default), multiple output files are produced, each with the
rank name appended to the output file name.
    """
    parser = OptionParser(usage, description=description)
    parser.add_option("-o", "--outfile", dest="outfile",
                      metavar="OUTFILE",
                      help="Write count table to OUTFILE")
    parser.add_option(
        "-r", "--rank", dest="ranks", default=None,
        metavar="RANK", action="append",
        help=""" Rank(s) to collect counts on. Use flag multiple times to
        specify multiple ranks. If multiple values given, one table
        produced for each with rank name appended to file name. Defaults
        to all major ranks between phylum and species. Corresponds to rank
        names in nodes.dmp. To see list run:
        'cut -f5 nodes.dmp | uniq | sort | uniq' in ncbi tax dir. Will
        also accept 'organism' to mean no rank (ie, just the organism
        name).""",
    )
    parser.add_option(
        "-s", "--collapseToDomain", default=False, action="store_true",
        help="Collapse all taxa below given rank down to "
             "superkingdom/domain. EG: in the genus output, anything "
             "assigned to Cyanobactia, will be lumped in with all "
             "other bacteria",
    )
    parser.add_option(
        "-R", "--printRank", dest="printRanks", action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)",
    )

    # option for deconvoluting clusters or assemblies
    addWeightOption(parser, multiple=True)

    # cutoff options
    addCountOptions(parser)

    # format, tax dir, and more
    addTaxonOptions(
        parser,
        choices={"countMethod": ("LCA", "all", "first", "most",
                                 "tophit", "toporg", "consensus")})

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if not args:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    options.ranks = checkNoneOption(options.ranks)
    options.printRanks = checkNoneOption(options.printRanks)

    # Set defaults and check for some conflicts
    if options.ranks is None and options.taxdir is None:
        # using hit names only
        options.ranks = [ORG_RANK]
        if options.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if options.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if options.ranks is None:
            # set a default
            options.ranks = ["phylum", "class", "order",
                             "family", "genus", "species"]

        try:
            # Make sure the rank lists make sense
            options.ranks = cleanRanks(options.ranks)
            if options.printRanks is not None:
                options.printRanks = cleanRanks(options.printRanks)
        except Exception as e:
            # surface any rank validation problem as a usage error
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(options.weights)

    # only print to stdout if there is a single rank
    if len(options.ranks) > 1 and options.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(options.ranks) > 1:
        rank = None
        if options.countMethod in ["consensus", "most"]:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "us a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'"
            )
    else:
        rank = options.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(options)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in args:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    if options.countMethod in ("tophit", "toporg"):
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute

        params = FilterParams.createFromOptions(options)
        (multifile, readFileDict) = redistribute.multipleFileWrapper(
            fileLabels.keys(), params, returnLines=True)

        if options.countMethod == "tophit":
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=True,
                parseStyle=options.parseStyle,
                sequenceWeights=sequenceWeights,
            )
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(
                parseStyle=options.parseStyle,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
            )

            def translateHit(hit):
                """Return the first translation of a hit."""
                return hitTranslator.translateHit(hit)[0]
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=True,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=hits.ACCS,
            )

            # Organisms will be returned, make translator trivial:
            def translateHit(hit):
                """Return the hit unchanged."""
                return hit

        # use read->file mapping and hit translator to get file based counts
        # from returned (read,Hit) pairs
        increment = 1
        for (read, hit) in readHits:
            filename = readFileDict[read]
            filetag = fileLabels[filename]
            taxon = translateHit(hit)
            taxcount = fileCounts[filetag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                # reads missing from the weights table count once
                increment = sequenceWeights.get(read, 1)
            fileCounts[filetag][taxon] = taxcount + increment
            totals[filetag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately.
        # dict.items() works on Python 2 and 3; iteritems() is Python 2 only.
        for (filename, filetag) in fileLabels.items():
            # Plain text mode: the 'U' flag was removed in Python 3.11 and
            # universal newlines are already the default for text files.
            # The with-block closes the file even if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(
                    infile,
                    hitStringMap,
                    options.hitTableFormat,
                    options.filterTopPct,
                    options.parseStyle,
                    options.countMethod,
                    taxonomy=taxonomy,
                    rank=rank,
                    sortReads=options.hitTableSortReads,
                )

                (total, counts, hitMap) = countIterHits(
                    hitIter,
                    allMethod=options.allMethod,
                    weights=sequenceWeights)
                fileCounts[filetag] = counts
                totals[filetag] = total

                logging.info(
                    "parsed %d hits (%d unique) for %d reads from %s",
                    total, len(counts), len(hitMap), filename)

    printCountTablesByRank(fileCounts, totals, sortedLabels, options)