Python filterM8Stream Examples, edl.blastm8.filterM8Stream Python Examples

Example #1

0

Show file

File: hits.py Project: Piplopp/py-metagenomics

def parseM8FileIter(
        inhandle,
        hitStringMap,
        format,
        scorePct,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
        sortReads=False):
    """
    Filter, translate and count the hits found in a hit table stream.

    The stream is chained through three steps:
        filterM8Stream (set up from format, scorePct, sortReads and
            parsingStyle)
        processHits (given hitStringMap, parsingStyle, taxonomy and rank)
        applyCountMethod (given countMethod and ignoreEmptyHits)

    A countMethod of 'first' turns the score filter off, whatever
    scorePct was passed in.

    Returns the result of applyCountMethod (described upstream as an
    iterator over (read,hits) tuples).
    """
    # 'first' means no score cutoff at all
    cutoff = -1 if countMethod == 'first' else scorePct
    filterByScore = cutoff >= 0

    logger.info("Parsing hits")
    options = FilterParams()
    options.format = format
    if filterByScore:
        logger.info(
            "Filtering for scores within %s pct of best" %
            cutoff)
        options.topPct = cutoff
        options.sort = 'score'
    if filterByScore or sortReads:
        # only touched when filtering or sorting was asked for
        options.sortReads = sortReads
    options.parseStyle = parsingStyle

    # filter and parse the raw table
    filtered = filterM8Stream(inhandle, options, returnLines=False)

    # translate the hits (hit name map, parse style, taxonomy lookup)
    translated = processHits(
        filtered,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # finally, apply the count method
    return applyCountMethod(translated, countMethod, ignoreEmptyHits)

Example #2

0

Show file

File: hits.py Project: jmeppley/py-metagenomics

def parseAndFilterM8Stream(inhandle, options):
    """
    Pass the input stream through filterM8Stream, then hand the filtered
    result to parseM8Hits and return whatever parseM8Hits produces.
    """
    filteredHits = filterM8Stream(inhandle, options, returnLines=False)
    logger.info("Parsing hits")

    # parseM8Hits is told whether the parse style is one of these three
    descriptionStyles = (KEGG, ORGS, PFAM)
    return parseM8Hits(filteredHits,
                       options.parseStyle in descriptionStyles)

Example #3

0

Show file

File: hits.py Project: jmeppley/py-metagenomics

def parseAndFilterM8Stream(inhandle, options):
    """
    Run the input stream through the m8 filter and feed the outcome to
    parseM8Hits; the return value is the one parseM8Hits gives back.
    """
    filtered = filterM8Stream(inhandle, options, return_lines=False)
    logger.info("Parsing hits")

    # flag passed on to parseM8Hits: is the parse style KEGG, ORGS or PFAM?
    usesDescription = options.parseStyle in (KEGG, ORGS, PFAM)
    return parseM8Hits(filtered, usesDescription)

Example #4

0

Show file

File: get_sequences_from_m8.py Project: jmeppley/py-metagenomics

def loadHitRegions(blastFile, minLength, options):
    """
    Read a hit table and return a dict keyed by read name. Each value is
    the list of (start, end, annot) tuples built from that read's hits.

    Hits whose query span (abs(qstart - qend) + 1) is shorter than
    minLength are dropped. For hits flagged as reversed the query
    coordinates are stored as (qend, qstart).
    """
    regionsByRead = {}
    with InputFile(blastFile) as m8stream:
        params = FilterParams.create_from_arguments(options)
        totalHits = 0
        totalReads = 0
        keptHits = 0
        readHits = filterM8Stream(m8stream, params, return_lines=False)
        for (read, hits) in readHits:
            totalReads += 1
            regions = []
            for hit in hits:
                totalHits += 1
                if abs(hit.qstart - hit.qend) + 1 < minLength:
                    # too short
                    continue
                keptHits += 1

                if hit.format == GFF:
                    annot = "# %d # %d # %s # %s;evalue=%s" % (
                        hit.qstart, hit.qend,
                        hit.strand, hit.hitDesc, hit.evalue)
                    reverse = hit.strand != "+"
                else:
                    try:
                        annot = "%s [%d,%d] %0.1f%% %d bits" % (
                            hit.hit, hit.hstart, hit.hend,
                            hit.pctid, hit.score)
                    except AttributeError:
                        # fall back to a shorter annotation without pctid
                        annot = "%s [%d,%d] score: %d" % (
                            hit.hit, hit.hstart, hit.hend, hit.score)
                    reverse = hit.hstart > hit.hend

                # swap the query coordinates when the hit is backwards
                if reverse:
                    span = (hit.qend, hit.qstart)
                else:
                    span = (hit.qstart, hit.qend)
                regions.append(span + (annot,))
            regionsByRead[read] = regions

        logging.debug(
            "Kept %d of %d hits from %d lines to %d reads" %
            (keptHits, totalHits, m8stream.lines, totalReads))
    return regionsByRead

Example #5

0

Show file

File: assembly.py Project: jmeppley/py-metagenomics

def getSequenceHits(hitsFile, params):
    """
    Build a map from sequence ids to their hits.

    hitsFile is opened with InputFile and run through filterM8Stream
    using params. Sequences whose hit list is empty are left out of the
    returned dict.

    Returns a dict mapping each sequence id to the hits yielded for it.
    """
    sequenceHits = {}
    hitCount = 0
    with InputFile(hitsFile) as m8stream:
        for seqid, hits in filterM8Stream(m8stream, params,
                                          return_lines=False):
            if len(hits) == 0:
                # nothing survived filtering for this sequence
                continue
            hitCount += len(hits)
            sequenceHits[seqid] = hits
        # logged inside the with block because it reads m8stream.lines
        logging.debug("Parsed %d hits for %d sequences from %d lines" %
                      (hitCount, len(sequenceHits), m8stream.lines))
    return sequenceHits

Example #6

0

Show file

File: hits.py Project: JessAwBryant/py-metagenomics

def parseM8FileIter(inhandle, hitStringMap, format, scorePct, parsingStyle,
                    countMethod, taxonomy=None, rank=None,
                    ignoreEmptyHits=True, sortReads=False):
    """
    Wrapper that chains filterM8Stream, processHits and applyCountMethod:
        filter hits using format, scorePct, sortReads and parsingStyle
        translate hits using processHits (hitStringMap, parsingStyle,
            taxonomy, rank)
        apply countMethod (with ignoreEmptyHits)

    A countMethod of 'first' disables the score filter (scorePct is
    forced to -1).

    Returns the result of applyCountMethod (described upstream as an
    iterator over (read,hits) tuples).
    """

    # check filtering options
    if countMethod == 'first':
        scorePct = -1

    # set up the filter
    logger.info("Parsing hits")
    options = FilterParams()
    options.format = format
    if scorePct >= 0 or sortReads:
        # filter hits on score if requested
        if scorePct >= 0:
            logger.info(
                "Filtering for scores within %s pct of best" % scorePct)
            options.topPct = scorePct
            options.sort = 'score'
        options.sortReads = sortReads
    options.parseStyle = parsingStyle

    # filters and parses
    hitIter = filterM8Stream(inhandle, options, returnLines=False)

    # apply org or acc translation
    # apply map of hit names if given
    # look up taxon node
    hitIter = processHits(
        hitIter,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # apply count method
    hitIter = applyCountMethod(hitIter, countMethod, ignoreEmptyHits)

    return hitIter

Example #7

0

Show file

def loadHitRegions(blastFile, minLength, options):
    """
    Turn a hit table into a dict from read name to a list of
    (start, end, annot) tuples, one tuple per retained hit.

    A hit is skipped when abs(qstart - qend) + 1 is below minLength.
    Hits flagged as reversed get their query coordinates stored as
    (qend, qstart).
    """
    regionsByRead = {}
    params = FilterParams.create_from_arguments(options)
    m8stream = M8Stream(blastFile)
    totalHits = 0
    totalReads = 0
    keptHits = 0
    for (read, hits) in filterM8Stream(m8stream, params, returnLines=False):
        totalReads += 1
        regions = []
        for hit in hits:
            totalHits += 1
            if abs(hit.qstart - hit.qend) + 1 < minLength:
                # too short
                continue
            keptHits += 1

            if hit.format == GFF:
                annot = "# %d # %d # %s # %s;evalue=%s" % (
                    hit.qstart, hit.qend, hit.strand, hit.hitDesc,
                    hit.evalue)
                reverse = hit.strand != "+"
            else:
                try:
                    annot = "%s [%d,%d] %0.1f%% %d bits" % (
                        hit.hit, hit.hstart, hit.hend, hit.pctid,
                        hit.score)
                except AttributeError:
                    # fall back to a shorter annotation without pctid
                    annot = "%s [%d,%d] score: %d" % (
                        hit.hit, hit.hstart, hit.hend, hit.score)
                reverse = hit.hstart > hit.hend

            # swap the query coordinates when the hit is backwards
            if reverse:
                span = (hit.qend, hit.qstart)
            else:
                span = (hit.qstart, hit.qend)
            regions.append(span + (annot,))
        regionsByRead[read] = regions

    logging.debug(
        "Kept %d of %d hits to %d reads" %
        (keptHits, totalHits, totalReads))
    return regionsByRead

Example #8

0

Show file

File: hits.py Project: jmeppley/py-metagenomics

def parseM8FileIter(inhandle,
                    hitStringMap,
                    options,
                    parsingStyle,
                    countMethod,
                    taxonomy=None,
                    rank=None,
                    ignoreEmptyHits=True):
    """
    Chain the three hit-handling steps over a hit table stream:
        filterM8Stream, configured by options
        processHits, given hitStringMap, parsingStyle, taxonomy and rank
        applyCountMethod, given countMethod and ignoreEmptyHits

    options is handed to filterM8Stream untouched; parsingStyle is only
    passed to processHits.

    Returns the result of applyCountMethod (described upstream as an
    iterator over (read,hits) tuples).
    """
    logger.info("Parsing hits")

    # step 1: filter and parse the raw table
    filtered = filterM8Stream(inhandle, options, returnLines=False)

    # step 2: translate hits (hit name map, parse style, taxon lookup)
    translated = processHits(filtered,
                             hitStringMap=hitStringMap,
                             parseStyle=parsingStyle,
                             taxonomy=taxonomy,
                             rank=rank)

    # step 3: apply the count method
    return applyCountMethod(translated, countMethod, ignoreEmptyHits)

Example #9

0

Show file

File: hits.py Project: jmeppley/py-metagenomics

def parseM8FileIter(
        inhandle,
        hitStringMap,
        options,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True):
    """
    Wrap filterM8Stream, processHits and applyCountMethod in one call.

    The stream is filtered according to options, the surviving hits are
    translated by processHits (hitStringMap, parsingStyle, taxonomy,
    rank), and countMethod is then applied (with ignoreEmptyHits).

    options goes to filterM8Stream as given; this function does not
    modify it.

    Returns the result of applyCountMethod (described upstream as an
    iterator over (read,hits) tuples).
    """
    logger.info("Parsing hits")

    # filter and parse
    parsedHits = filterM8Stream(inhandle, options, return_lines=False)

    # translate: hit name map, parse style, taxon lookup
    translatedHits = processHits(
        parsedHits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank,
    )

    # count
    return applyCountMethod(translatedHits, countMethod, ignoreEmptyHits)

Example #10

0

Show file

File: identify_reads.py Project: jmeppley/py-metagenomics

def main():
    """
    Command line entry point: parse arguments, build the two taxon group
    sets, then scan every hit table and write out the reads (or hit lines)
    that pass the group tests described in the help text below.
    """
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser, defaults={"mapFile": None, "parseStyle": ACCS, "filterPct": -1, "countMethod": "all", "taxdir": None}
    )
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action="append",
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.",
    )
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.",
    )
    parser.add_argument(
        "-t",
        "--topHitPct",
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.",
    )
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be inkoved multiple times to choose multiple "
        "groups.",
    )
    parser.add_argument(
        "-r",
        "--reads",
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit " "lines for each read",
    )

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    # the taxonomy is only loaded when a taxdir was given
    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    # NOTE(review): 439482 is a hard-coded value that this function only
    # uses in these two debug messages
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    # (map file keys are read as int for the GIS parse style only)
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(arguments.mapFile, valueType=int, keyType=keyType)

    # set up some function pointers
    # NOTE(review): hitRE is rebound at module level; presumably it is read
    # by the _get*Taxid helpers chosen below -- confirm
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        # counters are reset for every hit table
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount += 1
            # bestScore starts at 0, so only scores above 0 can raise it
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                # (collection stops at the first taxid that is None or, when
                # a group 2 was given, is outside of it)
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    # (leaving the loop with break skips the else clause
                    # below, so this read is dropped)
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # for/else: this clause runs only when the loop over hits
                # finished without hitting the break above
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read, bestScore))
                # NOTE: the local name 'all' hides the builtin all() for the
                # rest of this function
                all = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit, taxids))
                        all = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    # (continue applies to the loop over reads)
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not arguments.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write("\n")
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized), read))
                    # hits are written in ascending (score, hit) order
                    for hit in sorted(recognized, key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        # per hit table summary
        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount, goodReadCount, readCount))

Example #11

0

Show file

File: identifyReads.py Project: JessAwBryant/py-metagenomics

def main():
    """
    Command line entry point (Python 2, optparse): parse options, build
    the two taxon group maps, then scan every hit table and write out the
    reads (or hit lines) that pass the group tests described in the help
    text below.
    """
    usage = "usage: %prog -O ORTHOLOGY [OPTIONS] BLAST_M8_FILES"
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for these reads or just read names (-r). The -F filter limits which hits are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser,defaults={'mapFile':None,'parseStyle':ACCS,'filterPct':-1,'countMethod':'all','taxdir':None})
    parser.add_option("-g", "--targetTaxonGroup", dest="group1", default=None, metavar="TAXON", action='append',
                      help="Taxon to identify reads in. Top hits (as defined by --topHitPct) must be in this group. It can be a taxid, a name, or a file listing taxids. Use multiple times to specify a list of organisms. Use -a to specify whether all or at least one of the top hits must match.")
    parser.add_option("-a","--any", default=False, action="store_true", help="If specified, accept reads where any top hit is to an organism in the target taxon/taxa. By default, all top hits must be in the target group.")
    addUniversalOptions(parser)
    parser.add_option('-t','--topHitPct', default=0, type='float',
                      help='How close (as a %) to the best score a hit must be to qualify as a top hit. Default is 0, ie must have the best score. Use 100 to get all hits.')
    parser.add_option("-G", "--outerTaxonGroup", dest="group2", default=None, metavar="TAXON", action="append",
                      help="Broader taxon to limit reads. All hits (use -F to limit these hits) must be in the target group or this group. Again, it can be a taxid, a name, or a file listing taxids. It can also be inkoved multiple times to choose multiple groups.")
    parser.add_option('-r','--reads', default=False, action="store_true",
                      help="Output just read names. By default, print the relevant hit lines for each read")

    (options, args) = parser.parse_args()

    # NOTE(review): options.about is not defined in this function;
    # presumably it is added by addUniversalOptions -- confirm
    if options.about:
        print description
        exit(0)

    # check args
    setupLogging(options,description)
    if options.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    # the taxonomy is only loaded when a taxdir was given
    if options.taxdir is not None:
        taxonomy = readTaxonomy(options.taxdir, namesMap=True)
    else:
        taxonomy = None

    group1Map=getGroupMap(options.group1,taxonomy)
    group2Map=getGroupMap(options.group2,taxonomy)
    # NOTE(review): 439482 is a hard-coded value that this function only
    # uses in these two debug messages
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group1Map),group1Map.get(439482,False)))
    if group2Map is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group2Map),group2Map.get(439482,False)))

    # map reads to hits
    # (map file keys are read as int for the GIS parse style only)
    if options.parseStyle==GIS:
        keyType=int
    else:
        keyType=None
    accToTaxMap = parseMapFile(options.mapFile,valueType=int,keyType=keyType)

    # set up some function pointers
    # NOTE(review): hitRE is rebound at module level; presumably it is read
    # by the _get*Taxid helpers chosen below -- confirm
    global hitRE
    hitRE=parsingREs.get(options.parseStyle,None)
    if options.parseStyle == ORGS:
        getTaxid=_getOrgTaxid
    elif options.parseStyle == HITID:
        getTaxid=_getHitidTaxid
    elif options.parseStyle == HITDESC:
        getTaxid=_getHitdescTaxid
    else:
        getTaxid=_getExprTaxid

    # for filtering:
    filterParams = FilterParams.createFromOptions(options)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle,outhandle) in inputIterator(args,options):
        # counters are reset for every hit table
        readCount=0
        goodReadCount=0
        printCount=0

        # parse file
        for (read,hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount+=1
            # bestScore starts at 0, so only scores above 0 can raise it
            bestScore=0
            hitTaxids={}
            for hit in hits:
                score=hit.score
                taxids=[]
                # does this hit have at least one associated taxid in group2?
                # (collection stops at the first taxid that is None or, when
                # a group 2 was given, has no true entry in group2Map)
                for taxid in getTaxid(hit,accToTaxMap,taxonomy):
                    if taxid is None:
                        break
                    if group2Map is not None and not group2Map.get(taxid,False):
                        break
                    taxids.append(taxid)
                if len(taxids)==0:
                    # nothing matched in the wider group
                    # (leaving the loop with break skips the else clause
                    # below, so this read is dropped)
                    break
                hitTaxids[hit]=taxids

                # find the top score
                if score>bestScore:
                    bestScore=score
            else:
                # for/else: this clause runs only when the loop over hits
                # finished without hitting the break above
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read,bestScore))
                # NOTE: the local name 'all' hides the builtin all() for the
                # rest of this function
                all=True
                recognized=[]
                for hit,taxids in _getBestHitTaxids(hitTaxids,bestScore,options.topHitPct):
                    if _anyTaxidInGroup(taxids,group1Map):
                        logging.debug("%s (%r)  is in group 1" % (hit,taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit,taxids))
                        all=False
                if len(recognized)==0:
                    # if none of the best are in our target list, next read
                    # (continue applies to the loop over reads)
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not options.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount+=1
                if options.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized),read))
                    # hits are written in ascending (score, hit) order
                    for hit in sorted(recognized,key=lambda h: (h.score,h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount+=1

        # per hit table summary
        if options.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount,readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount,goodReadCount, readCount))

Example #12

0

Show file

def pickBestHitByAbundance(m8stream,
                           filterParams=None,
                           return_lines=True,
                           return_translations=False,
                           organismCounts=None,
                           winnerTakeAll=False,
                           sequenceWeights=None,
                           **kwargs):
    """
    Given a hit table with (potentially) multiple hits for each read.
    Select the best hit for each read. Hits are parsed from given hit
    table (m8stream) with filterM8Stream if given a FilterParams object,
    otherwise m8stream is iterated directly and must yield (read, hits)
    pairs. Remaining keyword arguments are passed to getHitTranslator to
    translate hits to accessions, organisms, or anything else.

    Ambiguous hits (multiple 'best' hits to one read) are resolved as follows:
        given a set of reads that all hit the same list of translated hits:
            divvy up reads so that the abundance ratios change minimally

    Abundance is recorded for whatever the hit translator returns. The
    parseStyle keyword defaults to HITID when not supplied.

    Parameters:
        m8stream: hit table stream, or iterator over (read, hits) pairs
        filterParams: filter settings for filterM8Stream, or None to use
            m8stream as is
        return_lines: yield hit table lines (ignored, with a warning, when
            return_translations is also set)
        return_translations: yield (read, translated hits) tuples
        organismCounts: counts used to resolve ambiguous reads as they
            are seen; a str is loaded with getOrganismCountsFromFile.
            When None, ambiguous reads are held back and resolved at the
            end from the counts of the unambiguous reads.
        winnerTakeAll: passed through to assignHits
        sequenceWeights: optional map from read to the amount it adds to
            an organism's count (1 when the read is missing from the map)

    Yields (read,hit) tuples, (read, [translated hits]) tuples, or hit
    table lines.

    Raises Exception if a read comes with no hits at all.
    """
    if return_lines and return_translations:
        return_lines = False
        logger.warning("return_translations overrides return_lines!")

    # filtered hits
    if filterParams is None:
        hitIter = m8stream
    else:
        hitIter = filterM8Stream(m8stream, filterParams, return_lines=False)

    # custom function for pulling orgs from hits
    #  if no settings given, just use the hit ID as the 'organism'
    kwargs.setdefault("parseStyle", HITID)
    hitTranslator = getHitTranslator(**kwargs)

    # we need to keep track of lots of things
    orgCounts = {}
    totalReads = 0
    unambiguousReads = 0
    ambiguousReads = 0
    sameOrgCount = 0
    ambiguousHits = {}

    # Check to see if organism counts were given (as a file name)
    if organismCounts is not None and isinstance(organismCounts, str):
        organismCounts = getOrganismCountsFromFile(organismCounts)

    # loop over hits and yield unambiguous ones
    # Save ambiguous hits and org abundances
    logger.debug(str(hitIter))
    for (read, hits) in hitIter:
        logger.debug("Read: %s" % (read))
        totalReads += 1
        hitByOrg = {}
        orgs = []
        count = 0
        for hit in hits:
            count += 1
            hitOrgs = hitTranslator.translateHit(hit)
            logger.debug("Hit: %s (%s), %s" % (hit.hit, hitOrgs, hit.score))
            orgs.extend(hitOrgs)
            for org in hitOrgs:
                if org in hitByOrg:
                    # This should be REALLY rare.
                    sameOrgCount += 1
                    sameOrgExample = (read, hit.hit, org)
                    logger.warning(
                        "Read (%s) has two best hits to same org (%s)!" %
                        (read, org))
                    # always keep the first alphabetically, for reproducibility
                    if hit.hit < hitByOrg[org].hit:
                        hitByOrg[org] = hit
                else:
                    hitByOrg[org] = hit
        orgs = tuple(sorted(set(orgs)))
        if count == 0:
            # This *should* never happen
            logger.error("No hits for %s!!!!!" % (read))
            raise Exception("Read (%s) has not hits. This shouldn't happen." %
                            (read))
        elif count == 1 or len(hitByOrg) == 1:
            logger.debug("Read is UNambiguous")
            unambiguousReads += 1
            # the weight of a read does not depend on the org
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read, 1)
            else:
                increment = 1
            for org in orgs:
                orgCounts[org] = orgCounts.get(org, 0) + increment
            # Report the hit that was retained in hitByOrg, not whichever
            # hit the loop above visited last: when several hits map to
            # the same org the retained one is the first alphabetically.
            # With a single hit every retained entry is that hit; with no
            # translations at all, fall back to the hit itself.
            bestHit = next(iter(hitByOrg.values()), hit)
            if return_lines:
                yield bestHit.line
            elif return_translations:
                yield (read, orgs)
            else:
                yield (read, bestHit)
        else:
            logger.debug("Read IS ambiguous")
            ambiguousReads += 1
            if organismCounts is None:
                # If we don't have count data to start, save these til the end
                ambiguousHits.setdefault(orgs, []).append(hitByOrg)
            else:
                # Use given counts to resolve
                for (hit, org) in assignHits(orgs, [
                        hitByOrg,
                ], organismCounts, winnerTakeAll):
                    yield formatReturn(hit, org, return_lines,
                                       return_translations)

    logger.info("Processed %d reads:" % (totalReads))
    logger.info("Collected unambiguous counts for %d orgs from %d reads" %
                (len(orgCounts), unambiguousReads))

    # if we used given organism counts, then we are done
    if organismCounts is not None:
        return

    # otherwise, we have ambiguous reads to resolve
    logger.info("Need to resolve %d ambiguous reads hitting %d orgs" %
                (ambiguousReads, len(ambiguousHits)))

    if sameOrgCount > 0:
        # sameOrgExample is always bound here: it is set wherever
        # sameOrgCount is incremented
        elements = list(sameOrgExample)
        elements.insert(0, sameOrgCount)
        logger.warning(
            "found %d cases where a read had an extra hit to the same "
            "organism. For Example: %s (%s,%s)" % tuple(elements))

    # loop over ambiguous hits (grouped by possible orgs) and pick one for
    # each read; the counter is restarted to count resolved reads
    ambiguousReads = 0
    # keys are visited in sorted order so the output order is reproducible
    for orgs in sorted(ambiguousHits.keys()):
        hits = ambiguousHits[orgs]
        for (hit, org) in assignHits(orgs, hits, orgCounts, winnerTakeAll):
            ambiguousReads += 1
            yield formatReturn(hit, org, return_lines, return_translations)

    logger.info("Selected top hit for %d ambiguous reads for a total of %d "
                "returned hit assignments" %
                (ambiguousReads, ambiguousReads + unambiguousReads))

Example #13

0

Show file

File: redistribute.py Project: jmeppley/py-metagenomics

def pickBestHitByAbundance(m8stream,
                           filterParams=None,
                           returnLines=True,
                           returnTranslations=False,
                           organismCounts=None,
                           winnerTakeAll=False,
                           sequenceWeights=None,
                           **kwargs):
    """
    Given a hit table with (potentially) multiple hits for each read.
    Select the best hit for each read. Hits are parsed from given hit
    table (m8stream) if given a FilterParams object, otherwise it is
    assumed that m8stream is an iterator over Hit objects. Remaining
    keyword arguments are used to translate hits to accessions, organisms,
    or anything else using a HitTranslator.

    Ambiguous hits (multiple 'best' hits to one read) are resolved as follows:
        given a set of reads that all hit the same list of translated hits:
            divvy up reads so that the abundance ratios change minimally

    Abundance is recorded for whatever the Hittranslator returns. If a hit
    map and taxonomy are given, this will be organisms, if only the parseStyle
    is given and it's set to ACC, then accessions will be the currency. The
    default is HITID.

    Arguments:
        m8stream: hit table stream (when filterParams is given) or an
            iterator over (read, hits) pairs (when filterParams is None).
        filterParams: if not None, m8stream is first passed through
            blastm8.filterM8Stream with these parameters.
        returnLines: yield hit table lines (hit.line). Overridden by
            returnTranslations when both are true.
        returnTranslations: yield (read, translated hits) tuples.
        organismCounts: optional abundances used to resolve ambiguous reads
            as they are encountered. A string is handed to
            getOrganismCountsFromFile first. When None, ambiguous reads are
            held back and resolved at the end against the counts collected
            from the unambiguous reads.
        winnerTakeAll: passed through unchanged to assignHits.
        sequenceWeights: optional mapping of read -> weight (default weight
            1) added to the org counts for each unambiguous read.
        **kwargs: passed to getHitTranslator ("parseStyle" defaults to HITID).

    Yields (read,hit) tuples, (read, [translated hits]) tuples, or hit
    table lines.

    Raises Exception if a read arrives with no hits at all.
    """
    if returnLines and returnTranslations:
        returnLines = False
        logger.warning("returnTranslations overrides returnLines!")

    # filtered hits
    if filterParams is None:
        hitIter = m8stream
    else:
        hitIter = blastm8.filterM8Stream(
            m8stream, filterParams, returnLines=False)

    # custom function for pulling orgs from hits
    #  if no settings given, just use the hit ID as the 'organism'
    kwargs.setdefault("parseStyle", HITID)
    hitTranslator = getHitTranslator(**kwargs)

    # we need to keep track of lots of things
    orgCounts = {}
    totalReads = 0
    unambiguousReads = 0
    ambiguousReads = 0
    sameOrgCount = 0
    sameOrgExample = None
    ambiguousHits = {}

    # Organism counts may be given as a file name: load them if so
    if isinstance(organismCounts, str):
        organismCounts = getOrganismCountsFromFile(organismCounts)

    # loop over hits and yield unambiguous ones
    # Save ambiguous hits and org abundances
    logger.debug(str(hitIter))
    for (read, hits) in hitIter:
        logger.debug("Read: %s" % (read))
        totalReads += 1
        hitByOrg = {}
        orgs = []
        count = 0
        for hit in hits:
            count += 1
            hitOrgs = hitTranslator.translateHit(hit)
            logger.debug("Hit: %s (%s), %s" % (hit.hit, hitOrgs, hit.score))
            orgs.extend(hitOrgs)
            for org in hitOrgs:
                if org in hitByOrg:
                    # This should be REALLY rare.
                    sameOrgCount += 1
                    sameOrgExample = (read, hit.hit, org)
                    logger.warning(
                        "Read (%s) has two best hits to same org (%s)!" %
                        (read, org))
                    # always keep the first alphabetically, for reproducibility
                    if hit.hit < hitByOrg[org].hit:
                        hitByOrg[org] = hit
                else:
                    hitByOrg[org] = hit
        orgs = tuple(sorted(set(orgs)))
        if count == 0:
            # This *should* never happen
            logger.error("No hits for %s!!!!!" % (read))
            raise Exception(
                "Read (%s) has not hits. This shouldn't happen." %
                (read))
        elif count == 1 or len(hitByOrg) == 1:
            logger.debug("Read is UNambiguous")
            unambiguousReads += 1
            # the weight depends only on the read, so look it up once
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read, 1)
            else:
                increment = 1
            for org in orgs:
                orgCounts[org] = orgCounts.get(org, 0) + increment

            # Report the hit that was actually retained. With several hits
            # collapsing to one org, the loop variable is merely the last
            # hit seen, not necessarily the one kept in hitByOrg.
            if count == 1:
                keptHit = hit
            else:
                (keptHit,) = hitByOrg.values()

            if returnLines:
                yield keptHit.line
            elif returnTranslations:
                yield (read, orgs)
            else:
                yield (read, keptHit)
        else:
            logger.debug("Read IS ambiguous")
            ambiguousReads += 1
            if organismCounts is None:
                # If we don't have count data to start, save these til the end
                ambiguousHits.setdefault(orgs, []).append(hitByOrg)
            else:
                # Use given counts to resolve
                for (hit, org) in assignHits(
                        orgs, [hitByOrg, ], organismCounts, winnerTakeAll):
                    yield formatReturn(hit,
                                       org,
                                       returnLines,
                                       returnTranslations)

    logger.info("Processed %d reads:" % (totalReads))
    logger.info(
        "Collected unambiguous counts for %d orgs from %d reads" %
        (len(orgCounts), unambiguousReads))

    # if we used given organism counts, then we are done
    if organismCounts is not None:
        return

    # otherwise, we have ambiguous reads to resolve
    logger.info(
        "Need to resolve %d ambiguous reads hitting %d orgs" %
        (ambiguousReads, len(ambiguousHits)))

    if sameOrgCount > 0:
        elements = list(sameOrgExample)
        elements.insert(0, sameOrgCount)
        logger.warning(
            "found %d cases where a read had an extra hit to the same "
            "organism. For Example: %s (%s,%s)" %
            tuple(elements))

    # loop over ambiguous hits (grouped by possible orgs) and pick one for
    # each read. Groups are visited in sorted order so output is reproducible.
    ambiguousReads = 0
    for orgs in sorted(ambiguousHits.keys()):
        hits = ambiguousHits[orgs]
        for (hit, org) in assignHits(orgs, hits, orgCounts, winnerTakeAll):
            ambiguousReads += 1
            yield formatReturn(hit, org, returnLines, returnTranslations)

    logger.info(
        "Selected top hit for %d ambiguous reads for a total of %d "
        "returned hit assignments" %
        (ambiguousReads, ambiguousReads + unambiguousReads))

Example #14

0

Show file

def main():
    """
    Command-line entry point: filter a hit table, then merge the hits for
    each query/reference pair into a single output line.

    Each output line is tab separated: query, reference, summed match
    length, summed score, and the length-weighted mean percent identity.
    Overlapping hits within a query/reference pair are removed first unless
    the nonoverlapping buffer is negative.
    """
    # command line arguments
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        conflict_handler='resolve')

    # default to non-overlapping=0
    add_hit_table_arguments(parser,
                            flags='all',
                            defaults={'nonoverlapping': 0})
    parser.add_argument("-o",
                        "--outfilenome",
                        dest="outfilename",
                        default=None,
                        metavar="OUTFILENAME",
                        help="Write masked fasta output to OUTFILENAME.")
    # NOTE: plain 'r' instead of 'rU'. The 'U' flag was a no-op in Python 3
    # and is rejected by open() from Python 3.11 on.
    parser.add_argument('hit_table',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="Table of search results to be filtered. "
                        "If absent, data will be read from STDIN")

    add_universal_arguments(parser)

    arguments = parser.parse_args()

    setup_logging(arguments)

    # output file or STDOUT
    if arguments.outfilename is not None:
        logging.info("Writing data to %s" % (arguments.outfilename))
        outfile_handle = open(arguments.outfilename, 'w')
    else:
        logging.info("writing data to STDOUT")
        outfile_handle = sys.stdout

    # input file or STDIN (handled by argparse)
    infile_handle = arguments.hit_table

    try:
        logging.info("reading data from %s" % (infile_handle.name))

        # filter, but don't apply nonoverlapping yet
        # non-overlapping should be applied per-reference only
        params = FilterParams.create_from_arguments(arguments)
        # save user supplied value for later
        overlap_buffer = params.nonoverlapping
        # turn off for now
        params.set_nonoverlapping(-1)

        # merge
        hit_iter = filterM8Stream(infile_handle, params, return_lines=False)
        for query, query_hits in hit_iter:
            # group by reference hit
            hits_by_ref = defaultdict(list)
            for hit in query_hits:
                hits_by_ref[hit.hit].append(hit)

            # one output for query/reference pair
            for ref, ref_hits in hits_by_ref.items():

                # remove overlaps unless the buffer has been set to <0
                # Use the saved user value: the setting on params was
                # switched off above and must not be read back here.
                if overlap_buffer >= 0:
                    ref_hits = remove_overlapping_hits(
                        ref_hits, on_hit=True, buffer=overlap_buffer)
                    ref_hits = remove_overlapping_hits(
                        ref_hits, on_hit=False, buffer=overlap_buffer)

                # aggregate values
                length, score, identities = 0, 0, 0
                for hit in ref_hits:
                    length += hit.mlen
                    score += hit.score
                    try:
                        # this will be off by 100x
                        identities += hit.pctid * hit.mlen
                    except Exception:
                        # just report pctid=0 if no pctid column in input
                        pass

                # avoid dividing by zero if nothing of non-zero length is left
                mean_pctid = identities / length if length else 0.0

                outfile_handle.write(
                    "%s\t%s\t%d\t%d\t%0.2f\n" %
                    (query, ref, length, score, mean_pctid))
    finally:
        # release the handles even if filtering or merging failed
        outfile_handle.close()
        infile_handle.close()

Example #15

0

Show file

File: identify_reads.py Project: jmeppley/py-metagenomics

def main():
    """
    Command-line entry point: scan hit tables and report the reads (or
    their hit lines) whose best hits fall in the target taxon group while
    every other hit stays within the target or outer group.
    """
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    taxon_defaults = {
        'mapFile': None,
        'parseStyle': ACCS,
        'filter_top_pct': -1,
        'countMethod': 'all',
        'taxdir': None}
    add_taxon_arguments(parser, defaults=taxon_defaults)
    parser.add_argument(
        "-g", "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, "
             "a name, or a file listing taxids. Use multiple times to "
             "specify a list of organisms. Use -a to specify whether "
             "all or at least one of the top hits must match.")
    parser.add_argument(
        "-a", "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
             "in the target taxon/taxa. By default, all top hits must be "
             "in the target group.")
    parser.add_argument(
        '-t', '--topHitPct',
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
             "to qualify as a top hit. Default is 0, ie must have the best "
             "score. Use 100 to get all hits.")
    parser.add_argument(
        "-G", "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
             "these hits) must be in the target group or this group. Again, "
             "it can be a taxid, a name, or a file listing taxids. "
             "It can also be inkoved multiple times to choose multiple "
             "groups.")
    parser.add_argument(
        '-r', '--reads',
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # a target group is mandatory
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    # the taxonomy is only loaded when a directory was supplied
    taxonomy = (readTaxonomy(arguments.taxdir, namesMap=True)
                if arguments.taxdir is not None else None)

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug(
        "Group 1 has %d entries and 439482 in group1 is %s" %
        (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug(
            "Group 2 has %d entries and 439482 in group2 is %s" %
            (len(group_2_set), 439482 in group_2_set))

    # map file keys are cast to int only for the GIS parse style
    keyType = int if arguments.parseStyle == GIS else None
    accToTaxMap = parseMapFile(
        arguments.mapFile, valueType=int, keyType=keyType)

    # choose the regex and the taxid lookup that match the parse style
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # one pass per (input, output) pair of hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        readHitIter = filterM8Stream(
            inhandle, filterParams, return_lines=False)
        for (read, hits) in readHitIter:
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # collect taxids, stopping at the first one that is
                # missing or falls outside the outer group
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if not taxids:
                    # nothing matched in the wider group: abandon this read
                    # (leaving the loop early skips the else clause below)
                    break
                hitTaxids[hit] = taxids

                # track the top score
                if score > bestScore:
                    bestScore = score
            else:
                # reached only when no hit was rejected above
                logging.debug(
                    "Checking best hits for %s (top score: %.1f)" %
                    (read, bestScore))
                everyBestInGroup = True
                matchedHits = []
                bestHitTaxids = _getBestHitTaxids(
                    hitTaxids, bestScore, arguments.topHitPct)
                for hit, taxids in bestHitTaxids:
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        matchedHits.append(hit)
                    else:
                        logging.debug(
                            "%s (%r) is not in group 1" %
                            (hit, taxids))
                        everyBestInGroup = False
                if not matchedHits:
                    # none of the best hits are in the target list
                    logging.debug(
                        "No best hits for %s are in group 1" %
                        (read))
                    continue
                if not (arguments.any or everyBestInGroup):
                    # without --any, every best hit must be in the target
                    logging.debug(
                        "Not all best hits for %s are in group 1" %
                        (read))
                    continue

                # the read qualifies
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug(
                        "Keeping %d hits for %s" %
                        (len(matchedHits), read))
                    orderedHits = sorted(
                        matchedHits,
                        key=lambda keptHit: (keptHit.score, keptHit.hit))
                    for hit in orderedHits:
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info(
                "Printed %d lines for %d of %d reads" %
                (printCount, goodReadCount, readCount))