def parseM8FileIter(
        inhandle,
        hitStringMap,
        format,
        scorePct,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
        sortReads=False):
    """
    Chain filterM8Stream, processHits and applyCountMethod into one iterator.

    A fresh FilterParams object is configured from ``format``, ``scorePct``,
    ``sortReads`` and ``parsingStyle`` and handed to filterM8Stream together
    with ``inhandle``. The parsed hits are then passed through processHits
    (with ``hitStringMap``, ``parsingStyle``, ``taxonomy`` and ``rank``) and
    finally through applyCountMethod (with ``countMethod`` and
    ``ignoreEmptyHits``).

    If taxonomy is not None, hits will be TaxNode objects.
    countMethod can only be LCA if a taxonomy is given.

    Return an iterator over (read, hits) tuples.
    """
    # a countMethod of 'first' switches score filtering off
    score_cutoff = -1 if countMethod == 'first' else scorePct
    use_cutoff = score_cutoff >= 0

    logger.info("Parsing hits")
    params = FilterParams()
    params.format = format
    if use_cutoff:
        logger.info(
            "Filtering for scores within %s pct of best" % score_cutoff)
        params.topPct = score_cutoff
    if use_cutoff or sortReads:
        # sorting by score is needed for the cutoff and for sorted reads
        params.sort = 'score'
        params.sortReads = sortReads
    params.parseStyle = parsingStyle

    # filter and parse, then translate hits, then apply the count method
    parsed_hits = filterM8Stream(inhandle, params, returnLines=False)
    translated_hits = processHits(
        parsed_hits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)
    return applyCountMethod(translated_hits, countMethod, ignoreEmptyHits)
def parseAndFilterM8Stream(inhandle, options):
    """
    Filter a hit table stream and parse what survives.

    The stream is run through filterM8Stream (asked for parsed hits instead
    of lines) and the result is handed to parseM8Hits, whose return value is
    returned unchanged.
    """
    filtered_hits = filterM8Stream(inhandle, options, returnLines=False)
    logger.info("Parsing hits")

    # the filter already parsed the hits; tell parseM8Hits whether the
    # parse style is one of KEGG, ORGS or PFAM
    description_styles = (KEGG, ORGS, PFAM)
    return parseM8Hits(filtered_hits,
                       options.parseStyle in description_styles)
def parseAndFilterM8Stream(inhandle, options):
    """
    Run the input stream through m8 filtering and then through parseM8Hits.

    filterM8Stream is called with return_lines=False so that it yields parsed
    hits; parseM8Hits receives that iterator plus a flag telling it whether
    options.parseStyle is one of KEGG, ORGS or PFAM. Whatever parseM8Hits
    returns is returned to the caller.
    """
    hit_stream = filterM8Stream(inhandle, options, return_lines=False)
    logger.info("Parsing hits")
    uses_description = options.parseStyle in (KEGG, ORGS, PFAM)
    return parseM8Hits(hit_stream, uses_description)
def loadHitRegions(blastFile, minLength, options):
    """
    Parse a hit table into a map from read names to lists of
    (start, end, annot) tuples.

    Hits whose query span (abs(qstart - qend) + 1) is shorter than minLength
    are skipped. For hits flagged as backwards (GFF: strand is not "+",
    otherwise: hstart > hend) the query coordinates are stored as
    (qend, qstart) instead of (qstart, qend).
    """
    regions = {}
    read_total = 0
    hit_total = 0
    kept_total = 0
    with InputFile(blastFile) as m8stream:
        params = FilterParams.create_from_arguments(options)
        parsed = filterM8Stream(m8stream, params, return_lines=False)
        for read, hits in parsed:
            read_total += 1
            spans = []
            for hit in hits:
                hit_total += 1
                if abs(hit.qstart - hit.qend) + 1 < minLength:
                    continue
                kept_total += 1

                if hit.format == GFF:
                    annot = "# %d # %d # %s # %s;evalue=%s" % \
                        (hit.qstart, hit.qend, hit.strand,
                         hit.hitDesc, hit.evalue)
                    backwards = hit.strand != "+"
                else:
                    try:
                        annot = "%s [%d,%d] %0.1f%% %d bits" % \
                            (hit.hit, hit.hstart, hit.hend,
                             hit.pctid, hit.score)
                    except AttributeError:
                        # fall back to a shorter label when an attribute
                        # used above is missing on the hit
                        annot = "%s [%d,%d] score: %d" % (
                            hit.hit, hit.hstart, hit.hend, hit.score)
                    backwards = hit.hstart > hit.hend

                # swap the query coordinates if the hit is backwards
                if backwards:
                    span = (hit.qend, hit.qstart)
                else:
                    span = (hit.qstart, hit.qend)
                spans.append(span + (annot,))
            regions[read] = spans
        logging.debug(
            "Kept %d of %d hits from %d lines to %d reads" %
            (kept_total, hit_total, m8stream.lines, read_total))
    return regions
def getSequenceHits(hitsFile, params):
    """
    Build a map from sequence IDs to their hits.

    hitsFile is opened with InputFile and run through filterM8Stream using
    the given filter params. Sequences whose hit collection is empty are
    left out of the result.

    Returns a dict mapping each sequence ID to the hits yielded for it.
    """
    sequenceHits = {}
    hitCount = 0
    with InputFile(hitsFile) as m8stream:
        for seqid, hits in filterM8Stream(m8stream,
                                          params,
                                          return_lines=False):
            if len(hits) == 0:
                continue
            hitCount += len(hits)
            sequenceHits[seqid] = hits
        # lazy logging arguments: nothing is formatted unless DEBUG is on
        logging.debug("Parsed %d hits for %d sequences from %d lines",
                      hitCount, len(sequenceHits), m8stream.lines)
    return sequenceHits
def parseM8FileIter(inhandle,
                    hitStringMap,
                    format,
                    scorePct,
                    parsingStyle,
                    countMethod,
                    taxonomy=None,
                    rank=None,
                    ignoreEmptyHits=True,
                    sortReads=False):
    """
    Wrapper that combines filterM8Stream, processHits and applyCountMethod.

    Steps:
     * filter and parse hits with a FilterParams object built from
       ``format``, ``scorePct``, ``sortReads`` and ``parsingStyle``
     * translate hits using processHits (``hitStringMap``, ``parsingStyle``,
       ``taxonomy``, ``rank``)
     * apply ``countMethod`` using applyCountMethod

    If taxonomy is not None, hits will be TaxNode objects.
    countMethod can only be LCA if a taxonomy is given.

    Return an iterator over (read, hits) tuples.
    """
    # a countMethod of 'first' switches score filtering off
    if countMethod == 'first':
        scorePct = -1

    logger.info("Parsing hits")
    options = FilterParams()
    options.format = format
    if scorePct >= 0 or sortReads:
        # filter hits on score if requested
        if scorePct >= 0:
            logger.info("Filtering for scores within %s pct of best",
                        scorePct)
            options.topPct = scorePct
        options.sort = 'score'
        options.sortReads = sortReads

    # filters and parses
    options.parseStyle = parsingStyle
    hitIter = filterM8Stream(inhandle, options, returnLines=False)

    # apply org or acc translation, the map of hit names (if given) and
    # look up taxon nodes
    hitIter = processHits(hitIter,
                          hitStringMap=hitStringMap,
                          parseStyle=parsingStyle,
                          taxonomy=taxonomy,
                          rank=rank)

    # apply count method
    hitIter = applyCountMethod(hitIter, countMethod, ignoreEmptyHits)
    return hitIter
def loadHitRegions(blastFile, minLength, options):
    """
    Parse a hit table into a map from read names to lists of
    (start, end, annot) tuples.

    Hits are read from an M8Stream over blastFile and filtered with a
    FilterParams built from options. Hits whose query span
    (abs(qstart - qend) + 1) is below minLength are dropped. For backwards
    hits (GFF: strand is not "+", otherwise: hstart > hend) the tuple holds
    (qend, qstart, annot) instead of (qstart, qend, annot).
    """
    regions = {}
    params = FilterParams.create_from_arguments(options)
    m8stream = M8Stream(blastFile)
    hit_total = 0
    read_total = 0
    kept_total = 0
    for read, hits in filterM8Stream(m8stream, params, returnLines=False):
        read_total += 1
        spans = []
        for hit in hits:
            hit_total += 1
            if abs(hit.qstart - hit.qend) + 1 < minLength:
                continue
            kept_total += 1

            if hit.format == GFF:
                annot = "# %d # %d # %s # %s;evalue=%s" % (
                    hit.qstart, hit.qend, hit.strand, hit.hitDesc,
                    hit.evalue)
                backwards = hit.strand != "+"
            else:
                try:
                    annot = "%s [%d,%d] %0.1f%% %d bits" % (
                        hit.hit, hit.hstart, hit.hend, hit.pctid,
                        hit.score)
                except AttributeError:
                    # shorter label when an attribute used above is missing
                    annot = "%s [%d,%d] score: %d" % (
                        hit.hit, hit.hstart, hit.hend, hit.score)
                backwards = hit.hstart > hit.hend

            # swap the query coordinates if the hit is backwards
            if backwards:
                span = (hit.qend, hit.qstart)
            else:
                span = (hit.qstart, hit.qend)
            spans.append(span + (annot,))
        regions[read] = spans

    logging.debug(
        "Kept %d of %d hits to %d reads" %
        (kept_total, hit_total, read_total))
    return regions
def parseM8FileIter(inhandle,
                    hitStringMap,
                    options,
                    parsingStyle,
                    countMethod,
                    taxonomy=None,
                    rank=None,
                    ignoreEmptyHits=True,
                    ):
    """
    Chain filterM8Stream, processHits and applyCountMethod into one iterator.

    ``options`` is passed to filterM8Stream exactly as given; it is not
    modified here. ``parsingStyle`` is only forwarded to processHits, along
    with ``hitStringMap``, ``taxonomy`` and ``rank``. ``countMethod`` and
    ``ignoreEmptyHits`` go to applyCountMethod.

    If taxonomy is not None, hits will be TaxNode objects.
    countMethod can only be LCA if a taxonomy is given.

    Return an iterator over (read, hits) tuples.
    """
    logger.info("Parsing hits")

    # stage 1: filter and parse the raw hit table
    parsed_hits = filterM8Stream(inhandle, options, returnLines=False)

    # stage 2: translate hits (hit name map, org/acc parsing, taxon lookup)
    translated_hits = processHits(
        parsed_hits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # stage 3: apply the count method
    return applyCountMethod(translated_hits, countMethod, ignoreEmptyHits)
def parseM8FileIter(
        inhandle,
        hitStringMap,
        options,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
):
    """
    Wrapper that filters, translates and counts hits in one pass.

    The pipeline is filterM8Stream(inhandle, options) -> processHits ->
    applyCountMethod. ``options`` is used unchanged for filtering;
    ``parsingStyle``, ``hitStringMap``, ``taxonomy`` and ``rank`` are given
    to processHits; ``countMethod`` and ``ignoreEmptyHits`` are given to
    applyCountMethod.

    If taxonomy is not None, hits will be TaxNode objects.
    countMethod can only be LCA if a taxonomy is given.

    Return an iterator over (read, hits) tuples.
    """
    logger.info("Parsing hits")
    filtered = filterM8Stream(inhandle, options, return_lines=False)
    translation_settings = dict(
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank,
    )
    return applyCountMethod(
        processHits(filtered, **translation_settings),
        countMethod,
        ignoreEmptyHits,
    )
def main():
    """
    Command line entry point: screen hit tables by taxon group.

    For every (input, output) pair from inputIterator, reads are kept when
    every hit has a taxid accepted by the outer group (-G, if given) and the
    top-scoring hits fall in the target group (-g). Either the read names
    (-r) or the matching hit lines are written to the output handle.
    """
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """

    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            "mapFile": None,
            "parseStyle": ACCS,
            "filterPct": -1,
            "countMethod": "all",
            "taxdir": None,
        },
    )
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action="append",
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.",
    )
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.",
    )
    parser.add_argument(
        "-t",
        "--topHitPct",
        default=0,
        type=float,
        help="How close (as a percentage) to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.",
    )
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be invoked multiple times to choose multiple "
        "groups.",
    )
    parser.add_argument(
        "-r",
        "--reads",
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
        "lines for each read",
    )

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    # NOTE(review): 439482 is a hard-coded taxid probe; looks like leftover
    # debugging output - confirm before removing
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s"
                  % (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s"
                      % (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(arguments.mapFile,
                               valueType=int,
                               keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle,
                                           filterParams,
                                           returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # collect taxids for this hit, stopping at the first one
                # that is missing or outside the outer group
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group: reject the read
                    # (the break skips the for/else clause below)
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # only reached when no hit triggered the break above,
                # i.e. every hit was in the wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)",
                              read, bestScore)
                all_in_target = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids,
                                                     bestScore,
                                                     arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r) is in group 1", hit, taxids)
                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1",
                                      hit, taxids)
                        all_in_target = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1", read)
                    continue
                if (not arguments.any) and (not all_in_target):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1",
                                  read)
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s", read)
                    outhandle.write(read)
                    outhandle.write("\n")
                else:
                    logging.debug("Keeping %d hits for %s",
                                  len(recognized), read)
                    for hit in sorted(recognized,
                                      key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads"
                         % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads"
                         % (printCount, goodReadCount, readCount))
def main():
    """
    Command line entry point (optparse version): screen hit tables by taxon
    group.

    For every (input, output) pair from inputIterator, reads are kept when
    every hit has a taxid accepted by the outer group (-G, if given) and the
    top-scoring hits fall in the target group (-g). Either the read names
    (-r) or the matching hit lines are written to the output handle.
    """
    usage = "usage: %prog -O ORTHOLOGY [OPTIONS] BLAST_M8_FILES"
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """

    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser,
                    defaults={'mapFile': None,
                              'parseStyle': ACCS,
                              'filterPct': -1,
                              'countMethod': 'all',
                              'taxdir': None})
    parser.add_option(
        "-g", "--targetTaxonGroup", dest="group1", default=None,
        metavar="TAXON", action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, a "
             "name, or a file listing taxids. Use multiple times to specify "
             "a list of organisms. Use -a to specify whether all or at "
             "least one of the top hits must match.")
    parser.add_option(
        "-a", "--any", default=False, action="store_true",
        help="If specified, accept reads where any top hit is to an "
             "organism in the target taxon/taxa. By default, all top hits "
             "must be in the target group.")
    addUniversalOptions(parser)
    parser.add_option(
        '-t', '--topHitPct', default=0, type='float',
        help='How close (as a %) to the best score a hit must be to '
             'qualify as a top hit. Default is 0, ie must have the best '
             'score. Use 100 to get all hits.')
    parser.add_option(
        "-G", "--outerTaxonGroup", dest="group2", default=None,
        metavar="TAXON", action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
             "these hits) must be in the target group or this group. "
             "Again, it can be a taxid, a name, or a file listing taxids. "
             "It can also be invoked multiple times to choose multiple "
             "groups.")
    parser.add_option(
        '-r', '--reads', default=False, action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    (options, args) = parser.parse_args()

    if options.about:
        # function-call form works on both Python 2 and Python 3
        print(description)
        exit(0)

    # check args
    setupLogging(options, description)
    if options.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if options.taxdir is not None:
        taxonomy = readTaxonomy(options.taxdir, namesMap=True)
    else:
        taxonomy = None

    group1Map = getGroupMap(options.group1, taxonomy)
    group2Map = getGroupMap(options.group2, taxonomy)
    # NOTE(review): 439482 is a hard-coded taxid probe; looks like leftover
    # debugging output - confirm before removing
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s"
                  % (len(group1Map), group1Map.get(439482, False)))
    if group2Map is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s"
                      % (len(group2Map), group2Map.get(439482, False)))

    # map reads to hits
    if options.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(options.mapFile,
                               valueType=int,
                               keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(options.parseStyle, None)
    if options.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif options.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif options.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.createFromOptions(options)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(args, options):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle,
                                           filterParams,
                                           returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # collect taxids for this hit, stopping at the first one
                # that is missing or not flagged in the outer group map
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group2Map is not None and \
                            not group2Map.get(taxid, False):
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group: reject the read
                    # (the break skips the for/else clause below)
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # only reached when no hit triggered the break above,
                # i.e. every hit was in the wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)"
                              % (read, bestScore))
                all_in_target = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids,
                                                     bestScore,
                                                     options.topHitPct):
                    if _anyTaxidInGroup(taxids, group1Map):
                        logging.debug("%s (%r) is in group 1"
                                      % (hit, taxids))
                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1"
                                      % (hit, taxids))
                        all_in_target = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1"
                                  % (read))
                    continue
                if (not options.any) and (not all_in_target):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1"
                                  % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if options.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug("Keeping %d hits for %s"
                                  % (len(recognized), read))
                    for hit in sorted(recognized,
                                      key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if options.reads:
            logging.info("Printed %d of %d reads"
                         % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads"
                         % (printCount, goodReadCount, readCount))
def pickBestHitByAbundance(m8stream,
                           filterParams=None,
                           return_lines=True,
                           return_translations=False,
                           organismCounts=None,
                           winnerTakeAll=False,
                           sequenceWeights=None,
                           **kwargs):
    """
    Given a hit table with (potentially) multiple hits for each read,
    select the best hit for each read.

    Hits are parsed from the given hit table (m8stream) if given a
    FilterParams object, otherwise it is assumed that m8stream is an
    iterator over (read, hits) pairs of Hit objects.

    Remaining keyword arguments are used to translate hits to accessions,
    organisms, or anything else using a HitTranslator (getHitTranslator).
    If no parseStyle is given, HITID is used.

    Ambiguous hits (multiple 'best' hits to one read) are resolved as
    follows: given a set of reads that all hit the same list of translated
    hits, divvy up reads so that the abundance ratios change minimally
    (delegated to assignHits). If organismCounts is given (a mapping, or a
    file name that is loaded with getOrganismCountsFromFile), ambiguous
    reads are resolved immediately against those counts; otherwise they are
    held back and resolved at the end against the counts collected from
    unambiguous reads (optionally weighted by sequenceWeights).

    When several hits of a read translate to the same single organism, the
    hit retained for that organism (the alphabetically first hit ID) is the
    one that is reported.

    This is a generator. It yields (read, hit) tuples,
    (read, [translated hits]) tuples, or hit table lines, depending on
    return_lines and return_translations (return_translations wins if both
    are set).

    Raises Exception if a read comes with no hits at all.
    """
    if return_lines and return_translations:
        return_lines = False
        logger.warning("return_translations overrides return_lines!")

    # filtered hits
    if filterParams is None:
        hitIter = m8stream
    else:
        hitIter = filterM8Stream(m8stream, filterParams, return_lines=False)

    # custom function for pulling orgs from hits
    # if no settings given, just use the hit ID as the 'organism'
    kwargs.setdefault("parseStyle", HITID)
    hitTranslator = getHitTranslator(**kwargs)

    # we need to keep track of lots of things
    orgCounts = {}
    totalReads = 0
    unambiguousReads = 0
    ambiguousReads = 0
    sameOrgCount = 0
    ambiguousHits = {}

    # Check to see if organism counts were given (possibly as a file name)
    if organismCounts is not None:
        if isinstance(organismCounts, str):
            organismCounts = getOrganismCountsFromFile(organismCounts)

    # loop over hits and yield unambiguous ones
    # Save ambiguous hits and org abundances
    logger.debug(str(hitIter))
    for (read, hits) in hitIter:
        logger.debug("Read: %s" % (read))
        totalReads += 1
        hitByOrg = {}
        orgs = []
        count = 0
        for hit in hits:
            count += 1
            hitOrgs = hitTranslator.translateHit(hit)
            logger.debug("Hit: %s (%s), %s" % (hit.hit, hitOrgs, hit.score))
            orgs.extend(hitOrgs)
            for org in hitOrgs:
                if org in hitByOrg:
                    # This should be REALLY rare.
                    sameOrgCount += 1
                    sameOrgExample = (read, hit.hit, org)
                    logger.warning(
                        "Read (%s) has two best hits to same org (%s)!" %
                        (read, org))
                    # always keep the first alphabetically, for
                    # reproducibility
                    if hit.hit < hitByOrg[org].hit:
                        hitByOrg[org] = hit
                else:
                    hitByOrg[org] = hit
        orgs = tuple(sorted(set(orgs)))
        if count == 0:
            # This *should* never happen
            logger.error("No hits for %s!!!!!" % (read))
            raise Exception("Read (%s) has not hits. This shouldn't happen."
                            % (read))
        elif count == 1 or len(hitByOrg) == 1:
            logger.debug("Read is UNambiguous")
            unambiguousReads += 1
            for org in orgs:
                if sequenceWeights is not None:
                    increment = sequenceWeights.get(read, 1)
                else:
                    increment = 1
                orgCounts[org] = orgCounts.get(org, 0) + increment
            if len(hitByOrg) == 1:
                # several hits may have collapsed onto one organism: report
                # the hit retained for it, not merely the last one iterated
                hit = next(iter(hitByOrg.values()))
            if return_lines:
                yield hit.line
            elif return_translations:
                yield (read, orgs)
            else:
                yield (read, hit)
        else:
            logger.debug("Read IS ambiguous")
            ambiguousReads += 1
            if organismCounts is None:
                # If we don't have count data to start, save these til the
                # end
                ambiguousHits.setdefault(orgs, []).append(hitByOrg)
            else:
                # Use given counts to resolve
                for (hit, org) in assignHits(orgs,
                                             [hitByOrg, ],
                                             organismCounts,
                                             winnerTakeAll):
                    yield formatReturn(hit,
                                       org,
                                       return_lines,
                                       return_translations)

    logger.info("Processed %d reads:" % (totalReads))
    logger.info("Collected unambiguous counts for %d orgs from %d reads" %
                (len(orgCounts), unambiguousReads))

    # if we used given organism counts, then we are done
    if organismCounts is not None:
        return

    # otherwise, we have ambiguous reads to resolve
    logger.info("Need to resolve %d ambiguous reads hitting %d orgs" %
                (ambiguousReads, len(ambiguousHits)))

    if sameOrgCount > 0:
        # sameOrgExample is always bound here: it is assigned every time
        # sameOrgCount is incremented
        elements = list(sameOrgExample)
        elements.insert(0, sameOrgCount)
        logger.warning(
            "found %d cases where a read had an extra hit to the same "
            "organism. For Example: %s (%s,%s)" % tuple(elements))

    # loop over ambiguous hits (grouped by possible orgs) and pick one for
    # each read; keys are sorted so the output order is reproducible
    ambiguousReads = 0
    for orgs in sorted(ambiguousHits.keys()):
        hits = ambiguousHits[orgs]
        for (hit, org) in assignHits(orgs, hits, orgCounts, winnerTakeAll):
            ambiguousReads += 1
            yield formatReturn(hit, org, return_lines, return_translations)

    logger.info("Selected top hit for %d ambiguous reads for a total of %d "
                "returned hit assignments" %
                (ambiguousReads, ambiguousReads + unambiguousReads))
def pickBestHitByAbundance(m8stream,
                           filterParams=None,
                           returnLines=True,
                           returnTranslations=False,
                           organismCounts=None,
                           winnerTakeAll=False,
                           sequenceWeights=None,
                           **kwargs):
    """
    Given a hit table with (potentially) multiple hits for each read,
    select the best hit for each read.

    Hits are parsed from the given hit table (m8stream) with
    blastm8.filterM8Stream if given a FilterParams object, otherwise it is
    assumed that m8stream is an iterator over (read, hits) pairs of Hit
    objects.

    Remaining keyword arguments are used to translate hits to accessions,
    organisms, or anything else using a HitTranslator (getHitTranslator).
    If no parseStyle is given, HITID is used.

    Ambiguous hits (multiple 'best' hits to one read) are resolved as
    follows: given a set of reads that all hit the same list of translated
    hits, divvy up reads so that the abundance ratios change minimally
    (delegated to assignHits). If organismCounts is given (a mapping, or a
    file name that is loaded with getOrganismCountsFromFile), ambiguous
    reads are resolved immediately against those counts; otherwise they are
    held back and resolved at the end against the counts collected from
    unambiguous reads (optionally weighted by sequenceWeights).

    When several hits of a read translate to the same single organism, the
    hit retained for that organism (the alphabetically first hit ID) is the
    one that is reported.

    This is a generator. It yields (read, hit) tuples,
    (read, [translated hits]) tuples, or hit table lines, depending on
    returnLines and returnTranslations (returnTranslations wins if both are
    set).

    Raises Exception if a read comes with no hits at all.
    """
    if returnLines and returnTranslations:
        returnLines = False
        logger.warning("returnTranslations overrides returnLines!")

    # filtered hits
    if filterParams is None:
        hitIter = m8stream
    else:
        hitIter = blastm8.filterM8Stream(
            m8stream, filterParams, returnLines=False)

    # custom function for pulling orgs from hits
    # if no settings given, just use the hit ID as the 'organism'
    kwargs.setdefault("parseStyle", HITID)
    hitTranslator = getHitTranslator(**kwargs)

    # we need to keep track of lots of things
    orgCounts = {}
    totalReads = 0
    unambiguousReads = 0
    ambiguousReads = 0
    sameOrgCount = 0
    ambiguousHits = {}

    # Check to see if organism counts were given (possibly as a file name)
    if organismCounts is not None:
        if isinstance(organismCounts, str):
            organismCounts = getOrganismCountsFromFile(organismCounts)

    # loop over hits and yield unambiguous ones
    # Save ambiguous hits and org abundances
    logger.debug(str(hitIter))
    for (read, hits) in hitIter:
        logger.debug("Read: %s" % (read))
        totalReads += 1
        hitByOrg = {}
        orgs = []
        count = 0
        for hit in hits:
            count += 1
            hitOrgs = hitTranslator.translateHit(hit)
            logger.debug("Hit: %s (%s), %s" % (hit.hit, hitOrgs, hit.score))
            orgs.extend(hitOrgs)
            for org in hitOrgs:
                if org in hitByOrg:
                    # This should be REALLY rare.
                    sameOrgCount += 1
                    sameOrgExample = (read, hit.hit, org)
                    logger.warning(
                        "Read (%s) has two best hits to same org (%s)!" %
                        (read, org))
                    # always keep the first alphabetically, for
                    # reproducibility
                    if hit.hit < hitByOrg[org].hit:
                        hitByOrg[org] = hit
                else:
                    hitByOrg[org] = hit
        orgs = tuple(sorted(set(orgs)))
        if count == 0:
            # This *should* never happen
            logger.error("No hits for %s!!!!!" % (read))
            raise Exception(
                "Read (%s) has not hits. This shouldn't happen." %
                (read))
        elif count == 1 or len(hitByOrg) == 1:
            logger.debug("Read is UNambiguous")
            unambiguousReads += 1
            for org in orgs:
                if sequenceWeights is not None:
                    increment = sequenceWeights.get(read, 1)
                else:
                    increment = 1
                orgCounts[org] = orgCounts.get(org, 0) + increment
            if len(hitByOrg) == 1:
                # several hits may have collapsed onto one organism: report
                # the hit retained for it, not merely the last one iterated
                hit = next(iter(hitByOrg.values()))
            if returnLines:
                yield hit.line
            elif returnTranslations:
                yield (read, orgs)
            else:
                yield (read, hit)
        else:
            logger.debug("Read IS ambiguous")
            ambiguousReads += 1
            if organismCounts is None:
                # If we don't have count data to start, save these til the
                # end
                ambiguousHits.setdefault(orgs, []).append(hitByOrg)
            else:
                # Use given counts to resolve
                for (hit, org) in assignHits(orgs,
                                             [hitByOrg, ],
                                             organismCounts,
                                             winnerTakeAll):
                    yield formatReturn(hit,
                                       org,
                                       returnLines,
                                       returnTranslations)

    logger.info("Processed %d reads:" % (totalReads))
    logger.info(
        "Collected unambiguous counts for %d orgs from %d reads" %
        (len(orgCounts), unambiguousReads))

    # if we used given organism counts, then we are done
    if organismCounts is not None:
        return

    # otherwise, we have ambiguous reads to resolve
    logger.info(
        "Need to resolve %d ambiguous reads hitting %d orgs" %
        (ambiguousReads, len(ambiguousHits)))

    if sameOrgCount > 0:
        # sameOrgExample is always bound here: it is assigned every time
        # sameOrgCount is incremented
        elements = list(sameOrgExample)
        elements.insert(0, sameOrgCount)
        logger.warning(
            "found %d cases where a read had an extra hit to the same "
            "organism. For Example: %s (%s,%s)" %
            tuple(elements))

    # loop over ambiguous hits (grouped by possible orgs) and pick one for
    # each read; keys are sorted so the output order is reproducible
    ambiguousReads = 0
    for orgs in sorted(ambiguousHits.keys()):
        hits = ambiguousHits[orgs]
        for (hit, org) in assignHits(orgs, hits, orgCounts, winnerTakeAll):
            ambiguousReads += 1
            yield formatReturn(hit, org, returnLines, returnTranslations)

    logger.info(
        "Selected top hit for %d ambiguous reads for a total of %d "
        "returned hit assignments" %
        (ambiguousReads, ambiguousReads + unambiguousReads))
def main():
    """
    Command line entry point: merge hits per query/reference pair.

    Hits are read from the hit table (file argument or STDIN), filtered with
    FilterParams built from the command line (with the non-overlapping
    filter switched off at that stage), grouped by reference for each query,
    optionally reduced to non-overlapping hits, and summed. One tab separated
    line is written per query/reference pair:
    query, reference, total length, total score, length-weighted pctid.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser,
                            flags='all',
                            defaults={'nonoverlapping': 0})
    # NOTE(review): the long flag is spelled "--outfilenome"; it is kept
    # as-is so existing command lines continue to work
    parser.add_argument("-o", "--outfilenome", dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # plain 'r': the 'U' mode flag is rejected by open() on Python 3.11+,
    # and text mode already handles universal newlines on Python 3
    parser.add_argument('hit_table', nargs='?',
                        type=argparse.FileType('r'), default=sys.stdin,
                        help="Table of search results to be filtered. "
                             "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table
    logging.info("reading data from %s" % (infile_handle.name))

    try:
        # filter, but don't apply nonoverlapping yet
        # non-overlapping should be applied per-reference only
        params = FilterParams.create_from_arguments(arguments)
        # save user supplied value for later
        overlap_buffer = params.nonoverlapping
        # turn off for now
        params.set_nonoverlapping(-1)

        # merge
        hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
        for query, query_hits in hit_iter:
            # group by reference hit
            hits_by_ref = defaultdict(list)
            for hit in query_hits:
                hits_by_ref[hit.hit].append(hit)

            # one output for query/reference pair
            for ref, ref_hits in hits_by_ref.items():

                # remove overlaps unless the buffer has been set to <0
                if overlap_buffer >= 0:
                    # use the saved user value: params.nonoverlapping was
                    # reset to -1 above and no longer holds it
                    ref_hits = remove_overlapping_hits(
                        ref_hits,
                        on_hit=True,
                        buffer=overlap_buffer)
                    ref_hits = remove_overlapping_hits(
                        ref_hits,
                        on_hit=False,
                        buffer=overlap_buffer)

                # aggregate values
                length, score, identities = 0, 0, 0
                for hit in ref_hits:
                    length += hit.mlen
                    score += hit.score
                    try:
                        # this will be off by 100x (pctid is a percentage)
                        identities += hit.pctid * hit.mlen
                    except (AttributeError, TypeError):
                        # just report pctid=0 if no pctid column in input
                        pass

                # avoid dividing by zero when nothing with length remains
                pctid = identities / length if length else 0

                outfile_handle.write(
                    "%s\t%s\t%d\t%d\t%0.2f\n" %
                    (query, ref, length, score, pctid))
    finally:
        # release handles even if filtering or writing failed
        if outfile_handle is sys.stdout:
            outfile_handle.flush()
        else:
            outfile_handle.close()
        infile_handle.close()
def main():
    """
    Command line entry point: screen hit tables by taxon group.

    For every (input, output) pair from inputIterator, reads are kept when
    every hit has a taxid accepted by the outer group (-G, if given) and the
    top-scoring hits fall in the target group (-g). Either the read names
    (-r) or the matching hit lines are written to the output handle.
    """
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """

    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'mapFile': None,
            'parseStyle': ACCS,
            'filter_top_pct': -1,
            'countMethod': 'all',
            'taxdir': None})
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, "
             "a name, or a file listing taxids. Use multiple times to "
             "specify a list of organisms. Use -a to specify whether "
             "all or at least one of the top hits must match.")
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
             "in the target taxon/taxa. By default, all top hits must be "
             "in the target group.")
    parser.add_argument(
        '-t',
        '--topHitPct',
        default=0,
        type=float,
        help="How close (as a percentage) to the best score a hit must be "
             "to qualify as a top hit. Default is 0, ie must have the best "
             "score. Use 100 to get all hits.")
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
             "these hits) must be in the target group or this group. Again, "
             "it can be a taxid, a name, or a file listing taxids. "
             "It can also be invoked multiple times to choose multiple "
             "groups.")
    parser.add_argument(
        '-r',
        '--reads',
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    # NOTE(review): 439482 is a hard-coded taxid probe; looks like leftover
    # debugging output - confirm before removing
    logging.debug(
        "Group 1 has %d entries and 439482 in group1 is %s" %
        (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug(
            "Group 2 has %d entries and 439482 in group2 is %s" %
            (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(
        arguments.mapFile,
        valueType=int,
        keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle,
                                           filterParams,
                                           return_lines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # collect taxids for this hit, stopping at the first one
                # that is missing or outside the outer group
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group: reject the read
                    # (the break skips the for/else clause below)
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # only reached when no hit triggered the break above,
                # i.e. every hit was in the wider taxon list
                logging.debug(
                    "Checking best hits for %s (top score: %.1f)",
                    read, bestScore)
                all_in_target = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(
                        hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r) is in group 1", hit, taxids)
                        recognized.append(hit)
                    else:
                        logging.debug(
                            "%s (%r) is not in group 1", hit, taxids)
                        all_in_target = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug(
                        "No best hits for %s are in group 1", read)
                    continue
                if (not arguments.any) and (not all_in_target):
                    # next read unless user said any or all hits are in list
                    logging.debug(
                        "Not all best hits for %s are in group 1", read)
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s", read)
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug(
                        "Keeping %d hits for %s", len(recognized), read)
                    for hit in sorted(recognized,
                                      key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" %
                         (goodReadCount, readCount))
        else:
            logging.info(
                "Printed %d lines for %d of %d reads" %
                (printCount, goodReadCount, readCount))