def main(): description = __doc__ parser = argparse.ArgumentParser(description=description) parser.add_argument("input_files", nargs="+", default=[], metavar="INFILE", help="List of hit tables to process") parser.add_argument("-o", "--outfile", dest="output_file", metavar="OUTFILE", help="Write count table to OUTFILE") parser.add_argument("-l", "--level", dest="levels", default=None, metavar="LEVEL", action="append", help=""" Level(s) to collect counts on. Use flag multiple times to specify multiple levels. If multiple values given, one table produced for each with rank name appended to file name. Levels can be an integer (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 'ko', or 'ortholog' (which are all synonyms), or anything not synonymous with 'gene' to get CAZy groups. Defaults to ortholog/role and levels 1, 2, and 3 for KEGG and SEED and gene and group for CAZy and COG.""") # option for deconvoluting clusters or assemblies add_weight_arguments(parser, multiple=True) # cutoff options add_count_arguments(parser) # format, ortholog heirarchy, and more kegg.add_path_arguments( parser, defaults={'countMethod': 'tophit'}, choices={'countMethod': ('tophit', 'first', 'most', 'all', 'consensus')}, helps={'countMethod': ("How to deal with counts from multiple hits. ('first': " "just use the first hit, 'most': " "can return multiple hits, 'all': return every hit, " "consensus: return None unless all the same). Do not " "use most or consensus with more than one level at a time. " "Default is 'tophit': This breaks any ties by choosing " "the most abundant hit based on other unambiguous " "assignments.")}) # log level and help add_universal_arguments(parser) arguments = parser.parse_args() setup_logging(arguments) if len(arguments.input_files) == 0: parser.error("Must supply at least one m8 file to parse") # Set defaults and check for some conflicts if arguments.levels is None and arguments.heirarchyFile is None: # using hit names only arguments.levels = [None] else: if arguments.heirarchyFile is None \ and arguments.heirarchyType != 'cazy': logging.warning("Type: %s", arguments.heirarchyType) parser.error("Cannot select levels without a heirarchy (ko) file") if arguments.levels is None: # set a default if arguments.heirarchyType is 'kegg': arguments.levels = ['ko', '1', '2', 'pathway'] if arguments.heirarchyType is 'seed': arguments.levels = ['role', '1', '2', 'subsystem'] else: arguments.levels = ['gene', 'group'] try: # Make sure the rank lists make sense arguments.levels = cleanLevels(arguments.levels) except Exception as e: parser.error(str(e)) # load weights file sequenceWeights = loadSequenceWeights(arguments.weights) # only print to stdout if there is a single level if len(arguments.levels) > 1 and arguments.output_file is None: parser.error("STDOUT only works if a single level is chosen!") cutoff = arguments.cutoff # map reads to hits if arguments.mapFile is not None: if arguments.mapStyle == 'auto': with open(arguments.mapFile) as f: firstLine = next(f) while len(firstLine) == 0 or firstLine[0] == '#': firstLine = next(f) if koMapRE.search(firstLine): arguments.mapStyle = 'kegg' elif seedMapRE.search(firstLine): arguments.mapStyle = 'seed' elif tabMapRE.search(firstLine): arguments.mapStyle = 'tab' # elif cogMapRE.search(firstLine): # arguments.mapStyle='cog' else: raise Exception( "Cannot figure out map type from first line:\n%s" % (firstLine)) logging.info("Map file seems to be: %s", arguments.mapStyle) if arguments.mapStyle == 'kegg': valueMap = kegg.parseLinkFile(arguments.mapFile) elif arguments.mapStyle == 'seed': valueMap = kegg.parseSeedMap(arguments.mapFile) # elif arguments.mapStyle=='cog': # valueMap=kegg.parseCogMap(arguments.mapFile) else: if arguments.parseStyle == GIS: keyType = int else: keyType = None valueMap = parseMapFile( arguments.mapFile, valueType=None, valueDelim=arguments.tab_map_delim, keyType=keyType) if len(valueMap) > 0: logging.info("Read %d items into map. EG: %s", len(valueMap), next(iter(valueMap.items()))) else: logging.warn("Read 0 items into value map!") else: valueMap = None # parse input files fileCounts = {} totals = {} fileLabels = {} sortedLabels = [] # Allow for file names to be preceded with TAG= for filename in arguments.input_files: bits = filename.split("=", 1) if len(bits) > 1: (filetag, filename) = bits else: filetag = filename fileLabels[filename] = filetag # keep order so that column order matches arguments sortedLabels.append(filetag) fileCounts[filetag] = {} totals[filetag] = 0 params = FilterParams.create_from_arguments(arguments) # TODO: incorporate weights into tophit algorithm! if arguments.countMethod == 'tophit': # Process all files at once and use overall abundance to pick best hits from edl import redistribute multifile = redistribute.multipleFileWrapper(fileLabels.items()) # don't give any hit translation, just use hit ids for redistribution readHits = redistribute.pickBestHitByAbundance( multifile, filterParams=params, returnLines=False, winnerTakeAll=True, parseStyle=arguments.parseStyle, sequenceWeights=sequenceWeights) # define method to turn Hits into Genes (kos, families) hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle, hitStringMap=valueMap) # translateHit = lambda hit: hitTranslator.translateHit(hit)[0] # use read->file mapping and hit translator to get file based counts # from returned (read,Hit) pairs increment = 1 for (read_name, hit) in readHits: file_tag, read_name = read_name.split("/", 1) file_tag = unquote_plus(file_tag) gene = hitTranslator.translateHit(hit)[0] if gene is None: gene = "None" logging.debug( "READ: %s\t%s\t%s\t%s", file_tag, read_name, hit.hit, gene) genecount = fileCounts[file_tag].setdefault(gene, 0) if sequenceWeights is not None: increment = sequenceWeights.get(read_name, 1) fileCounts[file_tag][gene] = genecount + increment totals[file_tag] += increment logging.debug(str(totals)) else: # Original way, just process each file separately for (filename, filetag) in fileLabels.items(): infile = open(filename, 'rU') hitIter = parseM8FileIter(infile, valueMap, params, arguments.parseStyle, arguments.countMethod, ignoreEmptyHits=arguments.mappedHitsOnly) (total, counts, hitMap) = \ countIterHits(hitIter, allMethod=arguments.allMethod, weights=sequenceWeights) fileCounts[filetag] = counts totals[filetag] = total logging.info( "parsed %d hits (%d unique) for %d reads from %s", total, len(counts), len(hitMap), filename) infile.close() logging.debug(repr(fileCounts)) printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
def main(): description = __doc__ parser = argparse.ArgumentParser(description) add_IO_arguments(parser) parser.add_argument("-l", "--level", dest="levels", default=None, metavar="LEVEL", action="append", help=""" Level(s) to collect counts on. Use flag multiple times to specify multiple levels. If multiple values given, one table produced for each with rank name appended to file name. Levels can be an integer (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 'ko', or 'ortholog' (which are all synonyms), or anything not synonymous with 'gene' to get CAZy groups. Defaults to ortholog/role and levels 1, 2, and 3 for KEGG and SEED and gene and group for CAZy and COG.""") parser.add_argument( '-s', '--squash', dest='splitForLevels', default=True, action='store_false', help="Don't split assignment rows if gene maps to multiple pathways, " "just squash them into one row using python list syntax") # format, ortholog heirarchy, and more kegg.add_path_arguments(parser) # log level and help add_universal_arguments(parser) arguments = parser.parse_args() setup_logging(arguments) # Set defaults and check for some conflicts if arguments.levels is None and arguments.heirarchyFile is None: # using hit names only arguments.levels = [None] else: if arguments.heirarchyFile is None \ and arguments.heirarchyType != 'cazy': logging.warn("Type: %s" % (arguments.heirarchyType)) parser.error("Cannot select levels without a heirarchy (ko) file") if arguments.levels is None: # set a default if arguments.heirarchyType is 'kegg': arguments.levels = ['ko', '1', '2', 'pathway'] if arguments.heirarchyType is 'seed': arguments.levels = ['role', '1', '2', 'subsystem'] else: arguments.levels = ['gene', 'group'] try: # Make sure the level list makes sense arguments.levels = cleanLevels(arguments.levels) except Exception as e: parser.error(str(e)) # map reads to hits if arguments.mapFile is not None: if arguments.mapStyle == 'auto': with open(arguments.mapFile) as f: firstLine = next(f) while len(firstLine) == 0 or firstLine[0] == '#': firstLine = next(f) if koMapRE.search(firstLine): arguments.mapStyle = 'kegg' elif seedMapRE.search(firstLine): arguments.mapStyle = 'seed' elif tabMapRE.search(firstLine): arguments.mapStyle = 'tab' elif cogMapRE.search(firstLine): arguments.mapStyle = 'cog' else: raise Exception( "Cannot figure out map type from first line:\n%s" % (firstLine)) logging.info("Map file seems to be: %s" % (arguments.mapStyle)) if arguments.mapStyle == 'kegg': valueMap = kegg.parseLinkFile(arguments.mapFile) elif arguments.mapStyle == 'seed': valueMap = kegg.parseSeedMap(arguments.mapFile) elif arguments.mapStyle == 'cog': valueMap = kegg.parseCogMap(arguments.mapFile) else: if arguments.parseStyle == hits.GIS: keyType = int else: keyType = None valueMap = parseMapFile( arguments.mapFile, valueType=None, keyType=keyType) if len(valueMap) > 0: logging.info("Read %d items into map. EG: %s" % (len(valueMap), next(iter(valueMap.items())))) else: logging.warn("Read 0 items into value map!") else: valueMap = None # set up level mapping levelMappers = [getLevelMapper(l, arguments) for l in arguments.levels] # parse input files for (inhandle, outhandle) in inputIterator(arguments): logging.debug( "Reading from %s and writing to %s" % (inhandle, outhandle)) hitMapIter = hits.parseM8FileIter( inhandle, valueMap, arguments.hitTableFormat, arguments.filterTopPct, arguments.parseStyle, arguments.countMethod, ignoreEmptyHits=arguments.mappedHitsOnly) if arguments.levels == [None]: arguments.levels = ['Hit'] outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels))) for read, hitIter in hitMapIter: assignments = [] for hit in hitIter: logging.debug("Hit: %s" % (hit)) assignment = [] for levelMapper in levelMappers: assignment.append(levelMapper(hit)) assignments.append(assignment) logging.debug("Read %s has %d hits" % (read, len(assignments))) for assignment in assignments: for assignmentList in handleMultipleMappings( assignment, arguments): outhandle.write( "%s\t%s\n" % (read, "\t".join(assignmentList)))
def main(): description = __doc__ parser = argparse.ArgumentParser(description) add_IO_arguments(parser) parser.add_argument("-l", "--level", dest="levels", default=None, metavar="LEVEL", action="append", help=""" Level(s) to collect counts on. Use flag multiple times to specify multiple levels. If multiple values given, one table produced for each with rank name appended to file name. Levels can be an integer (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 'ko', or 'ortholog' (which are all synonyms), or anything not synonymous with 'gene' to get CAZy groups. Defaults to ortholog/role and levels 1, 2, and 3 for KEGG and SEED and gene and group for CAZy and COG.""") parser.add_argument( '-S', '--squash', dest='splitForLevels', default=True, action='store_false', help="Don't split assignment rows if gene maps to multiple pathways, " "just squash them into one row using python list syntax") # format, ortholog heirarchy, and more kegg.add_path_arguments(parser) # log level and help add_universal_arguments(parser) arguments = parser.parse_args() setup_logging(arguments) # Set defaults and check for some conflicts if arguments.levels is None and arguments.heirarchyFile is None: # using hit names only arguments.levels = [None] else: if arguments.heirarchyFile is None \ and arguments.heirarchyType != 'cazy': logging.warn("Type: %s" % (arguments.heirarchyType)) parser.error("Cannot select levels without a heirarchy (ko) file") if arguments.levels is None: # set a default if arguments.heirarchyType is 'kegg': arguments.levels = ['ko', '1', '2', 'pathway'] if arguments.heirarchyType is 'seed': arguments.levels = ['role', '1', '2', 'subsystem'] else: arguments.levels = ['gene', 'group'] try: # Make sure the level list makes sense arguments.levels = cleanLevels(arguments.levels) except Exception as e: parser.error(str(e)) # map reads to hits if arguments.mapFile is not None: if arguments.mapStyle == 'auto': with open(arguments.mapFile) as f: firstLine = next(f) while len(firstLine) == 0 or firstLine[0] == '#': firstLine = next(f) if koMapRE.search(firstLine): arguments.mapStyle = 'kegg' elif seedMapRE.search(firstLine): arguments.mapStyle = 'seed' elif tabMapRE.search(firstLine): arguments.mapStyle = 'tab' elif cogMapRE.search(firstLine): arguments.mapStyle = 'cog' else: raise Exception( "Cannot figure out map type from first line:\n%s" % (firstLine)) logging.info("Map file seems to be: %s" % (arguments.mapStyle)) if arguments.mapStyle == 'kegg': valueMap = kegg.parseLinkFile(arguments.mapFile) elif arguments.mapStyle == 'seed': valueMap = kegg.parseSeedMap(arguments.mapFile) elif arguments.mapStyle == 'cog': valueMap = kegg.parseCogMap(arguments.mapFile) else: if arguments.parseStyle == hits.GIS: keyType = int else: keyType = None valueMap = parseMapFile(arguments.mapFile, valueType=None, valueDelim=arguments.tab_map_delim, keyType=keyType) if len(valueMap) > 0: logging.info("Read %d items into map. EG: %s" % (len(valueMap), next(iter(valueMap.items())))) else: logging.warn("Read 0 items into value map!") else: valueMap = None # set up level mapping levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels] # parse input files for (inhandle, outhandle) in inputIterator(arguments): logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle)) hitMapIter = hits.parseM8FileIter( inhandle, valueMap, hits.FilterParams.create_from_arguments(arguments), arguments.parseStyle, arguments.countMethod, ignoreEmptyHits=arguments.mappedHitsOnly) if arguments.levels == [None]: arguments.levels = ['Hit'] outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels))) for read, hitIter in hitMapIter: assignments = [] for hit in hitIter: logging.debug("Hit: %s" % (hit)) assignment = [] for levelMapper in levelMappers: assignment.append(levelMapper(hit)) assignments.append(assignment) logging.debug("Read %s has %d hits" % (read, len(assignments))) for assignment in assignments: for assignmentList in handleMultipleMappings( assignment, arguments): outhandle.write("%s\t%s\n" % (read, "\t".join(assignmentList)))
def main(): usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]" description = """ Takes a single m8 blast file and generates a table (or tables) of pathway/gene family assignments for the query sequences (aka 'reads'). Assignments can be for gene families, gene classes, or pathways. Multiple pathway or classification levels can be given. If they are, an assignment will be made at each level. This differs from assignPathsToReadsFromBlast.py in that: (1) it can handle CAZy and SEED, (2) it will output multiple levels in one file, (3) multiple assignments are always printed on multiple lines. This script will work with KEGG, SEED, or CAZy. CAZy only has one level of heirarchy, the others have 3. The CAZy heirarchy is apparent from the hit name and needs no supporting files. KEGG and SEED require mapping files to identify gene families and heirachy files to report levels other than the gene family or ortholog level. Both SEED and KEGG have three levels of classifications that can be indicated with a 1, 2, or 3. The words "subsystem" and "pathway" are synonyms for level 3. If a count method is selected that can produce multiple assignments per read, each assignment will be printed on a new line. NOTE: in KEGG (and SEED) a single ortholog (role) may belong to multiple pathways (subsystems). A hit to such an ortholog will result in extra assignment values for that query sequence (1 for each pathway it belongs to). """ parser = OptionParser(usage, description=description) addIOOptions(parser) parser.add_option("-l", "--level", dest="levels", default=None, metavar="LEVEL", action="append", help=""" Level(s) to collect counts on. Use flag multiple times to specify multiple levels. If multiple values given, one table produced for each with rank name appended to file name. Levels can be an integer (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 'ko', or 'ortholog' (which are all synonyms), or anything not synonymous with 'gene' to get CAZy groups. Defaults to ortholog/role and levels 1, 2, and 3 for KEGG and SEED and gene and group for CAZy and COG.""") parser.add_option('-s','--squash',dest='splitForLevels', default=True, action='store_false', help="Don't split assignment rows if gene maps to multiple pathways, just squash them into one row using python list syntax") # format, ortholog heirarchy, and more kegg.addPathOptions(parser) # log level and help addUniversalOptions(parser) (options, args) = parser.parse_args() setupLogging(options, description) # Set defaults and check for some conflicts if options.levels is None and options.heirarchyFile is None: # using hit names only options.levels=[None] else: if options.heirarchyFile is None and options.heirarchyType != 'cazy': logging.warn("Type: %s" % (options.heirarchyType)) parser.error("Cannot select levels without a heirarchy (ko) file") if options.levels is None: # set a default if options.heirarchyType is 'kegg': options.levels=['ko','1','2','pathway'] if options.heirarchyType is 'seed': options.levels=['role','1','2','subsystem'] else: options.levels=['gene','group'] try: # Make sure the level list makes sense options.levels=cleanLevels(options.levels) except Exception as e: parser.error(str(e)) # only print to stdout if there is a single input file if len(args)>1 and options.outfile is None: parser.error("STDOUT only works if a single input file is given!") # map reads to hits if options.mapFile is not None: if options.mapStyle == 'auto': with open(options.mapFile) as f: firstLine=f.next() while len(firstLine)==0 or firstLine[0]=='#': firstLine=f.next() if koMapRE.search(firstLine): options.mapStyle='kegg' elif seedMapRE.search(firstLine): options.mapStyle='seed' elif tabMapRE.search(firstLine): options.mapStyle='tab' #elif cogMapRE.search(firstLine): # options.mapStyle='cog' else: raise Exception("Cannot figure out map type from first line:\n%s" % (firstLine)) logging.info("Map file seems to be: %s" % (options.mapStyle)) if options.mapStyle=='kegg': valueMap=kegg.parseLinkFile(options.mapFile) elif options.mapStyle=='seed': valueMap=kegg.parseSeedMap(options.mapFile) #elif options.mapStyle=='cog': # valueMap=kegg.parseCogMap(options.mapFile) else: if options.parseStyle == hits.GIS: keyType=int else: keyType=None valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType) if len(valueMap)>0: logging.info("Read %d items into map. EG: %s" % (len(valueMap),valueMap.iteritems().next())) else: logging.warn("Read 0 items into value map!") else: valueMap=None # set up level mapping levelMappers = [getLevelMapper(l,options) for l in options.levels] # parse input files for (inhandle,outhandle) in inputIterator(args, options): logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle)) hitMapIter = hits.parseM8FileIter(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.mappedHitsOnly,sortReads=options.hitTableSortReads) outhandle.write("Read\t%s\n" % ('\t'.join(options.levels))) for read, hitIter in hitMapIter: assignments=[] for hit in hitIter: logging.debug("Hit: %s" % (hit)) assignment=[] for levelMapper in levelMappers: assignment.append(levelMapper(hit)) assignments.append(assignment) logging.debug("Read %s has %d hits" % (read, len(assignments))) for assignment in assignments: for assignmentList in handleMultipleMappings(assignment,options): outhandle.write("%s\t%s\n" % (read, "\t".join(assignmentList)))
def main(): description = __doc__ parser = argparse.ArgumentParser(description=description) parser.add_argument("input_files", nargs="+", default=[], metavar="INFILE", help="List of hit tables to process") parser.add_argument("-o", "--outfile", dest="output_file", metavar="OUTFILE", help="Write count table to OUTFILE") parser.add_argument("-l", "--level", dest="levels", default=None, metavar="LEVEL", action="append", help=""" Level(s) to collect counts on. Use flag multiple times to specify multiple levels. If multiple values given, one table produced for each with rank name appended to file name. Levels can be an integer (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 'ko', or 'ortholog' (which are all synonyms), or anything not synonymous with 'gene' to get CAZy groups. Defaults to ortholog/role and levels 1, 2, and 3 for KEGG and SEED and gene and group for CAZy and COG.""") # option for deconvoluting clusters or assemblies add_weight_arguments(parser, multiple=True) # cutoff options add_count_arguments(parser) # format, ortholog heirarchy, and more kegg.add_path_arguments( parser, defaults={'countMethod': 'tophit'}, choices={ 'countMethod': ('tophit', 'first', 'most', 'all', 'consensus') }, helps={ 'countMethod': ("How to deal with counts from multiple hits. ('first': " "just use the first hit, 'most': " "can return multiple hits, 'all': return every hit, " "consensus: return None unless all the same). Do not " "use most or consensus with more than one level at a time. " "Default is 'tophit': This breaks any ties by choosing " "the most abundant hit based on other unambiguous " "assignments.") }) # log level and help add_universal_arguments(parser) arguments = parser.parse_args() setup_logging(arguments) if len(arguments.input_files) == 0: parser.error("Must supply at least one m8 file to parse") # Set defaults and check for some conflicts if arguments.levels is None and arguments.heirarchyFile is None: # using hit names only arguments.levels = [None] else: if arguments.heirarchyFile is None \ and arguments.heirarchyType != 'cazy': logging.warning("Type: %s", arguments.heirarchyType) parser.error("Cannot select levels without a heirarchy (ko) file") if arguments.levels is None: # set a default if arguments.heirarchyType is 'kegg': arguments.levels = ['ko', '1', '2', 'pathway'] if arguments.heirarchyType is 'seed': arguments.levels = ['role', '1', '2', 'subsystem'] else: arguments.levels = ['gene', 'group'] try: # Make sure the rank lists make sense arguments.levels = cleanLevels(arguments.levels) except Exception as e: parser.error(str(e)) # load weights file sequenceWeights = loadSequenceWeights(arguments.weights) # only print to stdout if there is a single level if len(arguments.levels) > 1 and arguments.output_file is None: parser.error("STDOUT only works if a single level is chosen!") cutoff = arguments.cutoff # map reads to hits if arguments.mapFile is not None: if arguments.mapStyle == 'auto': with open(arguments.mapFile) as f: firstLine = next(f) while len(firstLine) == 0 or firstLine[0] == '#': firstLine = next(f) if koMapRE.search(firstLine): arguments.mapStyle = 'kegg' elif seedMapRE.search(firstLine): arguments.mapStyle = 'seed' elif tabMapRE.search(firstLine): arguments.mapStyle = 'tab' # elif cogMapRE.search(firstLine): # arguments.mapStyle='cog' else: raise Exception( "Cannot figure out map type from first line:\n%s" % (firstLine)) logging.info("Map file seems to be: %s", arguments.mapStyle) if arguments.mapStyle == 'kegg': valueMap = kegg.parseLinkFile(arguments.mapFile) elif arguments.mapStyle == 'seed': valueMap = kegg.parseSeedMap(arguments.mapFile) # elif arguments.mapStyle=='cog': # valueMap=kegg.parseCogMap(arguments.mapFile) else: if arguments.parseStyle == GIS: keyType = int else: keyType = None valueMap = parseMapFile(arguments.mapFile, valueType=None, valueDelim=arguments.tab_map_delim, keyType=keyType) if len(valueMap) > 0: logging.info("Read %d items into map. EG: %s", len(valueMap), next(iter(valueMap.items()))) else: logging.warn("Read 0 items into value map!") else: valueMap = None # parse input files fileCounts = {} totals = {} fileLabels = {} sortedLabels = [] # Allow for file names to be preceded with TAG= for filename in arguments.input_files: bits = filename.split("=", 1) if len(bits) > 1: (filetag, filename) = bits else: filetag = filename fileLabels[filename] = filetag # keep order so that column order matches arguments sortedLabels.append(filetag) fileCounts[filetag] = {} totals[filetag] = 0 # TODO: incorporate weights into tophit algorithm! if arguments.countMethod == 'tophit': # Process all files at once and use overall abundance to pick best hits from edl import redistribute params = FilterParams.create_from_arguments(arguments) multifile = redistribute.multipleFileWrapper(fileLabels.items()) # don't give any hit translation, just use hit ids for redistribution readHits = redistribute.pickBestHitByAbundance( multifile, filterParams=params, returnLines=False, winnerTakeAll=True, parseStyle=arguments.parseStyle, sequenceWeights=sequenceWeights) # define method to turn Hits into Genes (kos, families) hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle, hitStringMap=valueMap) # translateHit = lambda hit: hitTranslator.translateHit(hit)[0] # use read->file mapping and hit translator to get file based counts # from returned (read,Hit) pairs increment = 1 for (read_name, hit) in readHits: file_tag, read_name = read_name.split("/", 1) file_tag = unquote_plus(file_tag) gene = hitTranslator.translateHit(hit)[0] if gene is None: gene = "None" logging.debug("READ: %s\t%s\t%s\t%s", file_tag, read_name, hit.hit, gene) genecount = fileCounts[file_tag].setdefault(gene, 0) if sequenceWeights is not None: increment = sequenceWeights.get(read_name, 1) fileCounts[file_tag][gene] = genecount + increment totals[file_tag] += increment logging.debug(str(totals)) else: # Original way, just process each file separately for (filename, filetag) in fileLabels.items(): infile = open(filename, 'rU') hitIter = parseM8FileIter(infile, valueMap, arguments.hitTableFormat, arguments.filter_top_pct, arguments.parseStyle, arguments.countMethod, ignoreEmptyHits=arguments.mappedHitsOnly) (total, counts, hitMap) = \ countIterHits(hitIter, allMethod=arguments.allMethod, weights=sequenceWeights) fileCounts[filetag] = counts totals[filetag] = total logging.info("parsed %d hits (%d unique) for %d reads from %s", total, len(counts), len(hitMap), filename) infile.close() logging.debug(repr(fileCounts)) printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)