Example #1
def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(
            taxonomy, namesMap=kwargs.pop(
                'namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        if taxonomy is not None:
            # the mapped hit ids will need to be ints
            valueType = kwargs.pop('valueType', int)
        else:
            valueType = kwargs.pop('valueType', None)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    if isinstance(infile, str):
        inhandle = open(infile)
    else:
        inhandle = infile

    # get iterator over reads that will parse hits
    hitIter = parseM8FileIter(inhandle,
                              hitStringMap,
                              FilterParams(
                                  format=kwargs.pop('format', GENE),
                                  top_pct=kwargs.pop('filter_top_pct', 0),
                              ),
                              kwargs.pop('parseStyle', ACCS),
                              kwargs.pop('countMethod', 'all'),
                              taxonomy=taxonomy,
                              rank=kwargs.pop('rank', None))

    # count the hits
    (total, counts) = countIterHits(hitIter,
                                    allMethod=kwargs.pop('allMethod', ALLEQ),
                                    returnMap=False)

    logger.info("Total hits: %s" % total)
    if isinstance(infile, str):
        inhandle.close()

    return counts
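A hedged usage sketch for countHits (the file names, taxdmp path, and rank
are placeholders, not from the source):

# count hits per genus, mapping hit accessions to taxids first
counts = countHits('sample.vs.refseq.b6',
                   hitStringMap='acc.to.taxid.tsv',
                   parseStyle=ACCS,
                   taxonomy='/path/to/taxdmp',
                   rank='genus')
for taxon, count in counts.items():
    print("%s\t%s" % (taxon, count))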
Example #3
def buildSilvaTree(taxFile, fastaFile, logger):
    """
    Given a text taxonomy file (lineage <tab> id <tab> rank) and a fasta
    file with full lineages as the descriptions, return the root node of a
    taxonomy of edl.taxon.Node objects and a mapping from fasta record IDs
    to taxids.
    """
    rankMap = parseMapFile(taxFile, keyCol=0, valueCol=2, skipFirst=0)
    silvaTaxidMap = parseMapFile(taxFile, keyCol=0, valueCol=1,
                                 valueType=int, skipFirst=0)

    # create core of tree from taxonomy text file
    silvaTree = {}
    maxTaxid = max(silvaTaxidMap.values())
    for (lineage, rank) in rankMap.items():
        node = edl.silva.SilvaTaxNode.addToTreeFromString(
            lineage.strip("; "), silvaTree)
        node.rank = rankMapping.get(rank, rank)
        node.ncbi_tax_id = silvaTaxidMap[lineage]
        if not isinstance(node.ncbi_tax_id, int):
            logger.warn("NCBI taxid is not an int: %s (%s)" %
                        (node.ncbi_tax_id, node.name))

    logger.info("Built tree of %d taxa with the largest ID of %d" %
                (len(silvaTree), maxTaxid))

    # Add leaves to tree from lineages in fasta file and build mapping
    taxmap = {}
    for (hitid, lineage) in getOrgsFromSilvaFasta(fastaFile):
        node = edl.silva.SilvaTaxNode.addToTreeFromString(lineage, silvaTree)
        taxmap[hitid] = node

    logger.info("Added nodes from fasta file for a total of %d" %
                (len(silvaTree)))

    rootNode = next(iter(silvaTree.values())).getRootNode()
    # make sure every node has an integer id
    for node in treeGenerator(rootNode):
        if not isinstance(node.id, int):
            if "ncbi_tax_id" in dir(node):
                node.id = int(node.ncbi_tax_id)
            else:
                maxTaxid += 1
                node.id = maxTaxid

    logger.info("Cleaning up taxmap")

    # change nodes in taxmap to IDs
    for hitid in taxmap:
        taxmap[hitid] = taxmap[hitid].id

    return (rootNode, taxmap)
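A minimal sketch of calling buildSilvaTree (the SILVA file names are
placeholders; assumes the edl.silva module and the module-level rankMapping
dict used above):

import logging

logging.basicConfig()
logger = logging.getLogger("silva")
(root, taxmap) = buildSilvaTree('tax_slv_ssu.txt',
                                'SILVA_SSURef.fasta',
                                logger)
# taxmap maps fasta record IDs to integer taxids
logger.info("mapped %d records", len(taxmap))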
Example #4
def loadSequenceWeights(weightFiles):
    """
    Load and merge list of sequence weight maps.
    """
    if len(weightFiles) > 0:
        sequenceWeights = {}
        for weightFile in weightFiles:
            sequenceWeights.update(parseMapFile(weightFile, valueType=int))
    else:
        sequenceWeights = None
    return sequenceWeights
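A hypothetical usage of loadSequenceWeights, merging two weight tables
(later files override earlier keys on collision):

weights = loadSequenceWeights(['assembly.coverages.tsv',
                               'cluster.sizes.tsv'])
if weights is not None:
    print("total weight: %d" % sum(weights.values()))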
Example #7
def readIDMap(options):
    """
    Load the specififed lookup table for hit IDs. If the parseStyle
    requested is 'gis', convert keys to integers. The values are always
    convereted to integeres since they are assumed to be taxids
    """
    # map reads to hits
    if options.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    # values are taxids, so convert them to ints when a taxonomy is in use
    if options.taxdir is not None:
        valueType = int
    else:
        valueType = None
    return parseMapFile(options.mapFile, valueType=valueType, keyType=keyType)
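A hedged sketch of calling readIDMap with a bare namespace standing in for
parsed options (the map file name and taxdmp path are placeholders):

from argparse import Namespace

options = Namespace(parseStyle=GIS,
                    taxdir='/path/to/taxdmp',
                    mapFile='gi.to.taxid.tsv')
idMap = readIDMap(options)  # int GI -> int taxid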
Example #8
def main():
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes a single m8 blast file and generates a table (or tables) of pathway/gene family assignments for the query sequences (aka 'reads'). Assignments can be for gene families, gene classes, or pathways. Multiple pathway or classification levels can be given. If they are, an assignment will be made at each level.
    This differs from assignPathsToReadsFromBlast.py in that: (1) it can handle CAZy and SEED, (2) it will output multiple levels in one file, (3) multiple assignments are always printed on multiple lines.
    This script will work with KEGG, SEED, or CAZy. CAZy only has one level of hierarchy, the others have 3. The CAZy hierarchy is apparent from the hit name and needs no supporting files. KEGG and SEED require mapping files to identify gene families and hierarchy files to report levels other than the gene family or ortholog level. Both SEED and KEGG have three levels of classifications that can be indicated with a 1, 2, or 3. The words "subsystem" and "pathway" are synonyms for level 3.
    If a count method is selected that can produce multiple assignments per read, each assignment will be printed on a new line. 
    NOTE: in KEGG (and SEED) a single ortholog (role) may belong to multiple pathways (subsystems). A hit to such an ortholog will result in extra assignment values for that query sequence (1 for each pathway it belongs to). 
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    parser.add_option("-l", "--level", dest="levels", default=None,
                      metavar="LEVEL", action="append",
                      help=""" Level(s) to collect counts on. Use flag 
                      multiple times to specify multiple levels. If multiple 
                      values given, one table produced for each with rank 
                      name appended to file name. Levels can be an integer 
                      (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 
                      'ko', or 'ortholog' (which are all synonyms), or  
                      anything not synonymous with 'gene' to 
                      get CAZy groups. Defaults to ortholog/role and 
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_option('-s','--squash',dest='splitForLevels',
            default=True, action='store_false',
            help="Don't split assignment rows if gene maps to multiple pathways, just squash them into one row using python list syntax")

    # format, ortholog hierarchy, and more
    kegg.addPathOptions(parser)

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # Set defaults and check for some conflicts
    if options.levels is None and options.heirarchyFile is None:
        # using hit names only
        options.levels=[None]
    else:
        if options.heirarchyFile is None and options.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (options.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if options.levels is None:
            # set a default
            if options.heirarchyType == 'kegg':
                options.levels=['ko','1','2','pathway']
            elif options.heirarchyType == 'seed':
                options.levels=['role','1','2','subsystem']
            else:
                options.levels=['gene','group']

        try:
            # Make sure the level list makes sense
            options.levels=cleanLevels(options.levels)
        except Exception as e:
            parser.error(str(e))

    # only print to stdout if there is a single input file
    if len(args)>1 and options.outfile is None:
        parser.error("STDOUT only works if a single input file is given!")


    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle == 'auto':
            with open(options.mapFile) as f:
                firstLine=f.next()
                while len(firstLine)==0 or firstLine[0]=='#':
                    firstLine=f.next()
            if koMapRE.search(firstLine):
                options.mapStyle='kegg'
            elif seedMapRE.search(firstLine):
                options.mapStyle='seed'
            elif tabMapRE.search(firstLine):
                options.mapStyle='tab'
            #elif cogMapRE.search(firstLine):
            #    options.mapStyle='cog'
            else:
                raise Exception("Cannot figure out map type from first line:\n%s" % (firstLine))

        logging.info("Map file seems to be: %s" % (options.mapStyle))
        if options.mapStyle=='kegg':
            valueMap=kegg.parseLinkFile(options.mapFile)
        elif options.mapStyle=='seed':
            valueMap=kegg.parseSeedMap(options.mapFile)
        #elif options.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(options.mapFile)
        else:
            if options.parseStyle == hits.GIS:
                keyType=int
            else:
                keyType=None
            valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType)
        if len(valueMap)>0:
            logging.info("Read %d items into map. EG: %s" % (len(valueMap),valueMap.iteritems().next()))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap=None

    # set up level mapping
    levelMappers = [getLevelMapper(l,options) for l in options.levels]

    # parse input files
    for (inhandle,outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.mappedHitsOnly,sortReads=options.hitTableSortReads)

        outhandle.write("Read\t%s\n" % ('\t'.join(options.levels)))
        for read, hitIter in hitMapIter:
            assignments=[]
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment=[]
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(assignment,options):
                    outhandle.write("%s\t%s\n" % (read, "\t".join(assignmentList)))
Example #9
def main():
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'mapFile': None,
            'parseStyle': ACCS,
            'filter_top_pct': -1,
            'countMethod': 'all',
            'taxdir': None})
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, "
             "a name, or a file listing taxids. Use multiple times to "
             "specify a list of organisms. Use -a to specify whether "
             "all or at least one of the top hits must match.")
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
             "in the target taxon/taxa. By default, all top hits must be "
             "in the target group.")
    parser.add_argument(
        '-t',
        '--topHitPct',
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
             "to qualify as a top hit. Default is 0, ie must have the best "
             "score. Use 100 to get all hits.")
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
             "these hits) must be in the target group or this group. Again, "
             "it can be a taxid, a name, or a file listing taxids. "
             "It can also be inkoved multiple times to choose multiple "
             "groups.")
    parser.add_argument(
        '-r',
        '--reads',
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug(
        "Group 1 has %d entries and 439482 in group1 is %s" %
        (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug(
            "Group 2 has %d entries and 439482 in group2 is %s" %
            (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(
        arguments.mapFile,
        valueType=int,
        keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (
                read,
                hits) in filterM8Stream(
                inhandle,
                filterParams,
                return_lines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug(
                    "Checking best hits for %s (top score: %.1f)" %
                    (read, bestScore))
                all = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(
                        hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        recognized.append(hit)
                    else:
                        logging.debug(
                            "%s (%r) is not in group 1" %
                            (hit, taxids))
                        all = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug(
                        "No best hits for %s are in group 1" %
                        (read))
                    continue
                if (not arguments.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug(
                        "Not all best hits for %s are in group 1" %
                        (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug(
                        "Keeping %d hits for %s" %
                        (len(recognized), read))
                    for hit in sorted(
                        recognized,
                        key=lambda h: (
                            h.score,
                            h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info(
                "Printed %d lines for %d of %d reads" %
                (printCount, goodReadCount, readCount))
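The read loop above relies on Python's for/else construct: the else clause
runs only when the for loop completes without hitting break. A minimal
illustration of the pattern (is_acceptable and keep are hypothetical
names):

for hit in hits:
    if not is_acceptable(hit):
        break      # disqualifies the read and skips the else clause
else:
    keep(read)     # reached only when no hit triggered break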
Example #10
def main():
    """ set up the command line interface """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("-1", "--input_file_1",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar=("INPUT_TABLE_1"),
                        help="Input table 1")
    parser.add_argument("-2", "--input_file_2",
                        default=None,
                        type=argparse.FileType('r'),
                        metavar=("INPUT_TABLE_2"),
                        help="Input table 2")
    parser.add_argument("-m", "--multiplier",
                        default=None,
                        metavar=("MULTIPLIER_TABLE"),
                        help=("Table of values to multiply each sequence. "
                              "EG assembly coverages."))
    parser.add_argument("-T", "--total_reads",
                        default=0,
                        metavar="TOTAL_READS",
                        type=int,
                        help="Total number of reads to expect. (This allows "
                             "the reporting of unknown read count)")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        type=argparse.FileType('w'),
        default=sys.stdout,
        metavar="OUTFILE",
        help="Write count table to OUTFILE. (Defaults to STDOUT")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action="store_true",
        help="Print one number per row (prefixed by two keys) instead "
             "of a table with one seet of keys as column names and one "
             "set as row names.")
    parser.add_argument(
        "-H",
        "--hitCol1",
        dest="hitCol1",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 1 with hit name, -1 "
             "is default meaning all columns that are not the read name are "
             "hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-I",
        "--hitCol2",
        dest="hitCol2",
        type=int,
        default=-1,
        help="Index (starting at 0) of column in file 2 with hit name, -1 "
             "is default meaning all columns that are not the read name "
             "are hit names.",
        metavar="HITCOL")
    parser.add_argument(
        "-S",
        "--skipFirstRow",
        action="store_true",
        default=False,
        help="hit tables have a header row which needs to be skipped")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.input_file_1 is None or arguments.input_file_2 is None:
        parser.error("Please supply two input files")

    logging.info("reading hits from %s", arguments.input_file_1.name)
    hits1 = parseHits(arguments.input_file_1,
                      0,
                      arguments.hitCol1,
                      arguments.skipFirstRow,
                      None)
    logging.info("reading hits from %s", arguments.input_file_2.name)
    hits2 = parseHits(arguments.input_file_2,
                      0,
                      arguments.hitCol2,
                      arguments.skipFirstRow,
                      None)

    hits1 = tupleIteratorToMap(hits1)
    hits2 = tupleIteratorToMap(hits2)

    if arguments.multiplier is not None:
        multipliers = parseMapFile(arguments.multiplier, valueType=float)
    else:
        multipliers = None

    logging.info("counting hits")
    (table, cols) = combine_counts(hits1, hits2, multipliers,
                                   total_reads=arguments.total_reads)

    # print out hit table
    logging.info("printing table to " + arguments.outfile.name)
    print_table(arguments.outfile, table, cols,
                is_multiplied=multipliers is not None,
                long_output=arguments.long_output)
Example #12
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-s',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(l, arguments) for l in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s" %
            (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            arguments.hitTableFormat,
            arguments.filterTopPct,
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write(
                        "%s\t%s\n" %
                        (read, "\t".join(assignmentList)))
Example #13
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="output_file",
                        metavar="OUTFILE", help="Write count table to OUTFILE")
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={'countMethod':
                 ('tophit',
                  'first',
                  'most',
                  'all',
                  'consensus')},
        helps={'countMethod':
               ("How to deal with counts from multiple hits. ('first': "
                "just use the first hit, 'most': "
                "can return multiple hits, 'all': return every hit, "
                "consensus: return None unless all the same). Do not "
                "use most or consensus with more than one level at a time. "
                "Default is 'tophit': This breaks any ties by choosing "
                "the most abundant hit based on other unambiguous "
                "assignments.")})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    cutoff = arguments.cutoff

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                valueDelim=arguments.tab_map_delim,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s",
                         len(valueMap), next(iter(valueMap.items())))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)
        # translateHit = lambda hit: hitTranslator.translateHit(hit)[0]

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug(
                "READ: %s\t%s\t%s\t%s",
                file_tag, read_name, hit.hit, gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            infile = open(filename)

            hitIter = parseM8FileIter(infile,
                                      valueMap,
                                      params,
                                      arguments.parseStyle,
                                      arguments.countMethod,
                                      ignoreEmptyHits=arguments.mappedHitsOnly)

            (total, counts, hitMap) = \
                countIterHits(hitIter,
                              allMethod=arguments.allMethod,
                              weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

            infile.close()

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
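The TAG=filename convention handled above can be isolated into a small
helper; a sketch (split_file_tag is a hypothetical name):

def split_file_tag(argument):
    """ Split 'sampleA=hits/A.b6' into ('sampleA', 'hits/A.b6'); a bare
    filename serves as its own tag. """
    bits = argument.split("=", 1)
    if len(bits) > 1:
        return (bits[0], bits[1])
    return (argument, argument)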
Example #14
def main():
    ## set up CLI
    description = """
    Usage: lastWrapper.py [OPTIONS] -o OUTPUT_FILE LASTDB INPUT_FILE

    The last argument is taken as the input file and is fragmented into chunks on a local disk. Each chunk is run through lastal with all the given options (except that output goes to the local disk). The results of each chunk are concatenated into the requested output file (the -o argument).

    Breaks the input file into fragments to run last in a pseudo-multithreaded fashion. All lastal options (except -n) are accepted. Run lastal -h to see them. Additional options modify the batch behavior and post-processing. By default, output is converted to a blast-m8-like (aka 'gene') format and grouped by read.

The input file may be fasta or fastq. Fasta files are fragmented using the ">" character. Fastq files are assumed to have four lines per record.

    The recommended lastal options for reproducing BLASTX results are -b 1 -x 15 -y 7 -z 25, and these are invoked by default if the -F flag is used. To use different values, set them explicitly. This script will also mask the input fasta or fastq using tantan and pass '-u 2' to lastal. If the reads are already masked, or to disable masking, supply a value for -u to this script. You MUST specify a frameshift penalty with -F if the database is protein. 15 is a good value.

    Batch Behavior:
    -C CHUNK_SIZE               Set the number of reads per chunk (defers to -N)
    -N NUM_CHUNKS               Set the number of threads (defaults to 4)
    For detailed options for fragmenting fasta, run fragmentRecords.py -h

    Post Processing:
    -f FORMAT                   'gene' (the default), 'blast', or 'liz' for blast-like m8.
                                '0' or '1' for lastal formats
    -O                          Original order. By default, the tabular formats
                                (ie '0','gene','blast','liz') are grouped by read and
                                sorted by score within reads.
    -n HITS_PER_READ            Maximum number of hits per read to keep
                                Defaults to 10. Set to -1 to turn off.

    Hit Descriptions:
    Lastal does not return hit descriptions, just the ID string, but some formats have description columns (gene and liz). If the output format is one of these and if there is a DB.ids file (next to the DB.prj file), lastWrapper will use that file as a map from hit ids to descriptions.
    -d ID-TO-DESC-MAP            Map hit ids to descriptions using file
    -D                             Don't insert descriptions even if ids file present

    The lastal binary needs to be in your path. The same is true for tantan and sort, if those options are selected.

    Temporary files are created in a temporary location. This defaults to /localtmp if it exists and falls back to /tmp if not. You can set it with the option:
    -T TMP_DIR_ROOT            Directory in which to create temporary files

    Help/Info:
    -A, --about, -h, --help     This message

    """

    (options, args) = parseArgs()

    setupLogging(options, description, stream=sys.stdout)

    # Some basic checks
    if len(args) < 2:
        raise Exception("Command does not seem long enough for lastal!")

    # Get last argument as input file name
    infile = args.pop(-1)
    logging.info("Reading sequences from: " + infile)
    dbfile = args[-1]
    logging.info("Searching database: %s" % dbfile)
    outfile = options.outfile
    logging.info("Writing output to: %s " % outfile)

    # if options.verbose>1:
    #    args.insert(-1,'-v')
    if options.format == "1":
        if options.sort or options.maxHits > 0:
            logging.warn("Cannot sort or filter raw last output. Leaving untouched.")
    elif options.maxHits > 0:
        if not options.sort and options.format == "0":
            sys.exit("Cannot limit hits unless sorting or converting to M8 ('blast', 'gene', or 'liz')")

    # Apply any defaults not set by user
    if "-F" in options.userFlags:
        # this was a protein search
        defaultDict = protDefaults
    else:
        # this is a nucleotide search
        defaultDict = nuclDefaults
    for key in defaultDict:
        if key not in options.userFlags:
            args.insert(-1, key)
            args.insert(-1, defaultDict[key])

    # temporary file root
    if options.tmpDirRoot is None:
        if os.path.exists("/localtmp"):
            options.tmpDirRoot = "/localtmp"
        else:
            options.tmpDirRoot = "/tmp"

    ##
    # Fragment input file to temporary local dir
    if options.fastq:
        # fastq
        fileType = fileTypeMap["fastq"]
    else:
        fileType = fileTypeMap["fasta"]
    fragPref = "fragment"
    insuff = ".in"
    outsuff = ".out"

    # create local tmp dir
    localdir = tempfile.mkdtemp(suffix="lastWrapper", dir=options.tmpDirRoot)

    # make sure we know how big to make chunks
    if options.chunk is None:
        if options.splits is None:
            logging.info("Defaulting to 4 chunks")
            options.splits = 4
        options.chunk = getSizePerChunk(infile, options.splits, fileType, splitOnSize=options.splitOnSize)

    # mask with tantan unless mask is already set
    if options.mask is None:
        # add -u 2 to the command to use tantan results
        args.insert(-1, "-u")
        args.insert(-1, "2")
        # setup tantan command to pipe through fragmentInput
        command = [tantanBin, infile]
        logging.info("Masking with tantan")
        logging.debug(command)
        p = subprocess.Popen(command, stdout=subprocess.PIPE)
        instream = p.stdout
    else:
        # no masking (user has taken care of it)
        p = None
        instream = infile

    # fragment
    num = fragmentInputBySize(
        instream, localdir, options.chunk, fileType, fragPref, splitOnSize=options.splitOnSize, suffix=insuff
    )
    logging.info("Created %d fragments in %s" % (num, localdir))

    # check masking exit code if used
    if p is not None:
        ttCode = p.wait()
        if ttCode != 0:
            sys.exit("Tantan exited with code %d" % (ttCode))

    ## Run jobs
    # setup threads
    threads = []
    for i in range(num):
        inFrag = getFragmentPath(localdir, fragPref, i + 1, insuff)
        outFrag = getFragmentPath(localdir, fragPref, i + 1, outsuff)
        # clone argument list and create command for this fragment
        cmd = list(args)
        # if post processing is needed, change command to string and pipe
        logging.debug("Sort check: %r %r" % (options.sort, options.format))
        if options.format == "0" and (options.sort or options.maxHits > 0):
            cmd.append(inFrag)
            # sort (and possibly filter) last-formatted hit table
            cmd = "%s | %s" % (getCommandString(cmd), getSortCommand(options.sort, options.maxHits, options.tmpDirRoot))
            useShell = True
        elif options.format in ("blast", "gene", "liz"):
            cmd.append(inFrag)
            # convert to m8 (and sort)
            cmd = "%s | %s" % (
                getCommandString(cmd),
                getConvertCommand(options.format, options.sort, options.maxHits, options.tmpDirRoot),
            )
            useShell = True
        else:
            cmd.insert(-1, "-o")
            cmd.insert(-1, outFrag)
            cmd.append(inFrag)
            useShell = False

        # create thread
        threads.append(CommandThread(cmd, outFrag, shell=useShell))

    # start jobs
    for thread in threads:
        thread.start()

    # Do we need to look up descriptions
    if options.format in ("gene", "liz"):
        if isinstance(options.idMap, bool):
            # Default behaviour, check for DB.ids file
            idMapPath = dbfile + ".ids"
            if os.path.exists(idMapPath):
                options.idMap = idMapPath
            else:
                options.idMap = None

        # If user supplied map file or we found one:
        if options.idMap is not None:
            idToDescriptionMap = parseMapFile(options.idMap, delim="\t")
            # lookup and save column indices
            hitColumnIndex = getHitCol(options.format)
            hitDesColIndex = getHitCol(options.format, useDesc=True)
    else:
        options.idMap = None

    # wait and collect output
    exitcode = 0
    output = None
    for thread in threads:
        thread.join()

        # when processing first thread, we'll need to create output file
        if output is None:
            if outfile is None:
                output = sys.stdout
            else:
                output = open(outfile, "w")

        # Check thread status
        if thread.exitcode != 0:
            if thread.shell:
                logging.error("Command '%s' returned %s" % (thread.cmd, thread.exitcode))
            else:
                logging.error("Command '%s' returned %s" % (formatCommand(thread.cmd), thread.exitcode))
            exitcode = thread.exitcode
        else:
            logging.info("Thread %s completed!" % (str(thread)))

            # Handle output
            threadstream = open(thread.outfile)
            if options.idMap is None:
                # just copy
                for line in threadstream:
                    output.write(line)
            else:
                logging.debug("options.idMap: %s" % options.idMap)
                # insert descriptions
                for line in threadstream:
                    cells = line.split("\t")
                    hitId = cells[hitColumnIndex]
                    cells[hitDesColIndex] = idToDescriptionMap.get(hitId, "NA")
                    output.write("\t".join(cells))
            threadstream.close()

    output.close()

    if options.verbose <= 1:
        shutil.rmtree(localdir)

    sys.exit(exitcode)
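A hedged example invocation following the usage text above (the database
path and file names are placeholders):

# protein search (frameshift penalty 15), 8 chunks, m8-like 'gene' output:
#   lastWrapper.py -F 15 -N 8 -f gene -o reads.vs.nr.last /dbs/last/nr reads.fasta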
Example #15
def main():
    usage = "usage: %prog -O ORTHOLOGY [OPTIONS] BLAST_M8_FILES"
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for these reads or just read names (-r). The -F filter limits which hits are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser,defaults={'mapFile':None,'parseStyle':ACCS,'filterPct':-1,'countMethod':'all','taxdir':None})
    parser.add_option("-g", "--targetTaxonGroup", dest="group1", default=None, metavar="TAXON", action='append',
                      help="Taxon to identify reads in. Top hits (as defined by --topHitPct) must be in this group. It can be a taxid, a name, or a file listing taxids. Use multiple times to specify a list of organisms. Use -a to specify whether all or at least one of the top hits must match.")
    parser.add_option("-a","--any", default=False, action="store_true", help="If specified, accept reads where any top hit is to an organism in the target taxon/taxa. By default, all top hits must be in the target group.")
    addUniversalOptions(parser)
    parser.add_option('-t','--topHitPct', default=0, type='float',
                      help='How close (as a %) to the best score a hit must be to qualify as a top hit. Default is 0, ie must have the best score. Use 100 to get all hits.')
    parser.add_option("-G", "--outerTaxonGroup", dest="group2", default=None, metavar="TAXON", action="append",
                      help="Broader taxon to limit reads. All hits (use -F to limit these hits) must be in the target group or this group. Again, it can be a taxid, a name, or a file listing taxids. It can also be inkoved multiple times to choose multiple groups.")
    parser.add_option('-r','--reads', default=False, action="store_true",
                      help="Output just read names. By default, print the relevant hit lines for each read")

    (options, args) = parser.parse_args()

    if options.about:
        print(description)
        exit(0)

    # check args
    setupLogging(options,description)
    if options.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if options.taxdir is not None:
        taxonomy = readTaxonomy(options.taxdir, namesMap=True)
    else:
        taxonomy = None

    group1Map=getGroupMap(options.group1,taxonomy)
    group2Map=getGroupMap(options.group2,taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group1Map),group1Map.get(439482,False)))
    if group2Map is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group2Map),group2Map.get(439482,False)))

    # map reads to hits
    if options.parseStyle==GIS:
        keyType=int
    else:
        keyType=None
    accToTaxMap = parseMapFile(options.mapFile,valueType=int,keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE=parsingREs.get(options.parseStyle,None)
    if options.parseStyle == ORGS:
        getTaxid=_getOrgTaxid
    elif options.parseStyle == HITID:
        getTaxid=_getHitidTaxid
    elif options.parseStyle == HITDESC:
        getTaxid=_getHitdescTaxid
    else:
        getTaxid=_getExprTaxid

    # for filtering:
    filterParams = FilterParams.createFromOptions(options)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle,outhandle) in inputIterator(args,options):
        readCount=0
        goodReadCount=0
        printCount=0

        # parse file
        for (read,hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount+=1
            bestScore=0
            hitTaxids={}
            for hit in hits:
                score=hit.score
                taxids=[]
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit,accToTaxMap,taxonomy):
                    if taxid is None:
                        break
                    if group2Map is not None and not group2Map.get(taxid,False):
                        break
                    taxids.append(taxid)
                if len(taxids)==0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit]=taxids

                # find the top score
                if score>bestScore:
                    bestScore=score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read,bestScore))
                all=True
                recognized=[]
                for hit,taxids in _getBestHitTaxids(hitTaxids,bestScore,options.topHitPct):
                    if _anyTaxidInGroup(taxids,group1Map):
                        logging.debug("%s (%r)  is in group 1" % (hit,taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit,taxids))
                        all=False
                if len(recognized)==0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not options.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount+=1
                if options.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized),read))
                    for hit in sorted(recognized,key=lambda h: (h.score,h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount+=1

        if options.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount,readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount,goodReadCount, readCount))
def main():
    usage = "usage: %prog [OPTIONS] INPUT_FILE(S)"
    description = """
Takes an m8 blast and assigns each read to a pathway or gene family. Blast may be specified with -i or piped to STDIN.
    """
    parser = OptionParser(usage, description=description)
    parser.add_option("-i", "--inputfile", dest="infile",
                      metavar="INFILE", help="Read data table from INFILE"),
    addIOOptions(parser)
    parser.add_option('-O', "--outputStyle", default="cols",
                      choices=['cols','lines','python'],
                      help="How are multiple assignments displayed in output. By default ('cols'), multiple hits show up in multiple columns. The 'lines' option prints out a new line for each assignment. The 'python' option prints each assignment as a python string (in quotes) or a list of strings (in quotes, separted by commas, surrounded bya  pair of sqaure brackets).")
    parser.add_option("-m", "--mapFile", dest="mapFile",
                      metavar="MAPFILE", help="Location of file containing table of with db hit name as first column and geneIDs (Knumber) in second column.")
    parser.add_option("-M", "--mapStyle", default='auto', choices=['auto','kegg','tab'],
                      help="What type of mapping file are you using: simple tab separated list of IDs and kos, or the genes_ko.list file from KEGG (which adds ko: to the K numbers and can have multiple records for each gene id). By default, this script will inspect the file name and guess, but you can force either 'kegg' or 'tab' with this option.")
    parser.add_option("-p", "--parseStyle",
                      default=KEGG,
                      choices=[ACCS,GIS,KEGG,HITID,HITDESC],
                      help="What should be parsed from the hit table: accessions('accs'), 'gis', K numbers in description ('kegg'), the full hit name('hitid'), or the full hit description('hitdesc'). (defaults to '%default')")
    parser.add_option("-c", "--cutoff", dest="cutoff", type="float", default=0.01,
            help="Cutoff for showing paths or genes. If a fractional count for a path/gene is below this value, it will be labelled None.",
                  metavar="CUTOFF")

    # format and filterPct
    addHitTableOptions(parser)

    parser.add_option("-C", "--countMethod", dest="countMethod", default="all", choices=('first','most','all','consensus'),
                      help="How to deal with assignments from multiple hits. (first, most: can return multiple hits, all (default): return every hit, consensus: return None unless all the same)",
                    metavar="COUNTMETHOD")
    parser.add_option("-r","--filterForKO",action="store_true", dest="koHitsOnly", default=False, help="ignore hits with no KO assignment. This means reads with no hits to KO tagged sequences will not be in the output.")
    parser.add_option("-l","--level", dest="level", default="ko", choices=('ko','NAME','DEFINITION','EC','PATHWAY','1','2','3'), help="Either 'ko'; a string to look for in ko file ('PATHWAY','NAME', 'DEFINITION', or 'EC'); or level in kegg class heirarchy (1, 2, or 3 (should be same as PATHWAY))")
    parser.add_option("-k", "--koFile", dest="ko", metavar="KOFILE", default=None,
                      help="File containing kegg heirarchy (either ko or ko00001.keg)")
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.infile is None:
        infile = sys.stdin
    else:
        infile = open(options.infile)

    if options.parseStyle==KEGG:
        if options.mapFile is not None:
            logging.warn("Do you REALLY want to apply a mapping to KOs?")

    if options.level != 'ko':
        if options.ko is None:
            options.error("Please supply KEGG file if sepcifying a level other than 'ko' ")

        # read KEGG file
        koTranslation = readKEGGFile(options.ko, options.level)
    else:
        koTranslation = None

    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle=='kegg' or (options.mapStyle=='auto' and
                                        options.mapFile.endswith('genes_ko.list')):
            valueMap=parseLinkFile(options.mapFile)
        else:
            if options.parseStyle == GIS:
                keyType=int
            else:
                keyType=None
            valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType)
    else:
        valueMap=None

    for (inhandle,outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMap = parseM8File(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.koHitsOnly,sortReads=options.hitTableSortReads)

        # manipulate mappings
        hitMap = applySimpleCutoff(hitMap, options.cutoff, koTranslation)

        log("maps complete for %d reads" % (len(hitMap)))

        # print out hit table
        outhandle.write("Read\tHit\n")
        if options.outputStyle=='python':
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                outhandle.write(str(read))
                outhandle.write("\t")
                outhandle.write(repr(hit))
                outhandle.write("\n")
        elif options.outputStyle=='lines':
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                if isinstance(hit, list):
                    for h in sorted(hit):
                        outhandle.write(str(read))
                        outhandle.write("\t")
                        outhandle.write(str(h))
                        outhandle.write("\n")
                else:
                    outhandle.write(str(read))
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                    outhandle.write("\n")
        else:
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                outhandle.write(str(read))
                if isinstance(hit, list):
                    for h in sorted(hit):
                        outhandle.write("\t")
                        outhandle.write(str(h))
                else:
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                outhandle.write("\n")
Example #17
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description)
    add_IO_arguments(parser)
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-S',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
        "just squash them into one row using python list syntax")

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug("Reading from %s and writing to %s" %
                      (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            hits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write("%s\t%s\n" %
                                    (read, "\t".join(assignmentList)))
Example #18
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files",
                        nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o",
                        "--outfile",
                        dest="output_file",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog hierarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={
            'countMethod': ('tophit', 'first', 'most', 'all', 'consensus')
        },
        helps={
            'countMethod':
            ("How to deal with counts from multiple hits. ('first': "
             "just use the first hit, 'most': "
             "can return multiple hits, 'all': return every hit, "
             "consensus: return None unless all the same). Do not "
             "use most or consensus with more than one level at a time. "
             "Default is 'tophit': This breaks any ties by choosing "
             "the most abundant hit based on other unambiguous "
             "assignments.")
        })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    cutoff = arguments.cutoff

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s", len(valueMap),
                         next(iter(valueMap.items())))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0
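    # Note: the TAG= convention above lets each input file carry a short
    # label; e.g. a hypothetical invocation "count_hits.py A=runA.m8 runB.m8"
    # counts the first file under the label "A" and the second under its
    # own filename.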

    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)
        # translateHit = lambda hit: hitTranslator.translateHit(hit)[0]

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug("READ: %s\t%s\t%s\t%s", file_tag, read_name, hit.hit,
                          gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            infile = open(filename)

            hitIter = parseM8FileIter(infile,
                                      valueMap,
                                      arguments.hitTableFormat,
                                      arguments.filter_top_pct,
                                      arguments.parseStyle,
                                      arguments.countMethod,
                                      ignoreEmptyHits=arguments.mappedHitsOnly)

            (total, counts, hitMap) = \
                countIterHits(hitIter,
                              allMethod=arguments.allMethod,
                              weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info("parsed %d hits (%d unique) for %d reads from %s",
                         total, len(counts), len(hitMap), filename)

            infile.close()

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)
Example #19
0
def main():
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser, defaults={"mapFile": None, "parseStyle": ACCS, "filterPct": -1, "countMethod": "all", "taxdir": None}
    )
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action="append",
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.",
    )
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.",
    )
    parser.add_argument(
        "-t",
        "--topHitPct",
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.",
    )
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be inkoved multiple times to choose multiple "
        "groups.",
    )
    parser.add_argument(
        "-r",
        "--reads",
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit " "lines for each read",
    )

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(arguments.mapFile, valueType=int, keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read, bestScore))
                all = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit, taxids))
                        all = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not arguments.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write("\n")
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized), read))
                    for hit in sorted(recognized, key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount, goodReadCount, readCount))
Example #20
0
def main():
    # set up CLI
    description = __doc__

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-i", "--infile", dest="infile",
                        metavar="FILE", help="Read raw table from INFILE")
    parser.add_argument(
        "-o",
        "--outfile",
        dest="outfile",
        metavar="OUTFILE",
        help="Write collapsed table to OUTFILE")
    parser.add_argument("-d", "--delim", dest="delim", default="\t",
                        help="Input table delimiter", metavar="DELIM")
    parser.add_argument("-D", "--delimOut", dest="delimOut", default="\t",
                        help="Output table delimiter", metavar="DELIM")
    parser.add_argument(
        '-F',
        '--countFirst',
        action='store_true',
        default=False,
        help="Don't skip the first line, it's NOT a header")
    parser.add_argument(
        "-R",
        "--readColumn",
        dest="readCol",
        type=int,
        default=0,
        help="Index (starting at 0) of column with read name, 0 is default",
        metavar="READCOL")
    parser.add_argument(
        "-H",
        "--hitColumn",
        dest="hitCol",
        type=int,
        default=2,
        help="Index (starting at 0) of column with hit name (for counting), "
             "2 is default, if less than zero, all (non-read) columns will "
             "be used as multiple hits",
        metavar="HITCOL")
    parser.add_argument(
        '-s',
        '--hitSep',
        default=None,
        help="Use this string to split multiple values in single hit cell. "
             "Default is 'None' to leave hits as is, use 'eval' to parse "
             "as python repr strings")
    add_weight_arguments(parser, multiple=False)
    parser.add_argument("-T", "--total", default=False, action="store_true",
                        help="Report 'Total' in the first row")

    # cutoff options
    add_count_arguments(parser, {'cutoff': 0})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # make sure we have something to do
    if (arguments.infile is None):
        logging.info("Reading table from: STDIN")
    else:
        logging.info("Reading table from: " + arguments.infile)

    if (arguments.outfile is None):
        logging.info("Writing counts to: STDOUT")
    else:
        logging.info("Writing counts to: " + arguments.outfile)

    # process arguments
    takeFirst = (arguments.allMethod == 'first')
    splitHits = (arguments.hitSep is not None and arguments.hitSep != 'None')
    uncluster = (arguments.weights is not None)

    if arguments.hitSep == 'eval':
        parser.error("Sorry, parsing with eval is not yet supported!")

    # inform the curious user
    logging.info("Delimiter: '" + arguments.delim)
    logging.info("Read names in col: '" + str(arguments.readCol))
    logging.info("Hit names in col: '" + str(arguments.hitCol))
    if splitHits:
        logging.info("Splitting hits with: %s" % (arguments.hitSep))
        logging.warn(
            "Splitting hits has not been tested yet! Let me know how it goes.")
    if takeFirst:
        logging.info("Taking first hit for each read.")
    else:
        if arguments.allMethod == 'portion':
            logging.info("Dividing count among all hits for each read.")
        else:
            logging.info("Adding 1 to every hit for each read")
    if uncluster:
        logging.info(
            "Getting read cluster sizes from: %s" %
            (arguments.weights))
    if arguments.countFirst:
        logging.info("First line is data")
    else:
        logging.info("Skipping first line")

    # Do the counting!
    counts = {}
    countHitsForRead = getAllMethod(arguments.allMethod)

    clusteredReadCounts = {}
    if uncluster:
        clusteredReadCounts = parseMapFile(
            arguments.weights, valueType=int)

    currentRead = ''
    readCount = 1
    hits = []

    if arguments.infile is None:
        infile = sys.stdin
    else:
        infile = open(arguments.infile)

    # loop over lines
    if not arguments.countFirst:
        # skip first line
        try:
            next(infile)
        except StopIteration:
            raise Exception("No lines in %s" % str(infile))

    for line in infile:
        line = line.rstrip('\r\n')
        rowcells = line.split(arguments.delim)
        # get read
        read = rowcells[arguments.readCol]

        # if it's a new read, process previous read
        if currentRead == '':
            currentRead = read
        elif read != currentRead and currentRead != '':
            readCount += 1
            logging.info("Checking hits for %s" % currentRead)

            # was it part of a cluster?
            multiplier = 1
            if uncluster:
                multiplier = clusteredReadCounts[currentRead]

            # where does the count for this read go
            countHitsForRead(hits, counts, multiplier=multiplier)

            hits = []
            currentRead = read

        # get hit from this line
        if arguments.hitCol >= 0:
            hit = rowcells[arguments.hitCol]
            if splitHits:
                hits.extend(hit.split(arguments.hitSep))
            else:
                hits.append(hit)
        else:
            rowcells.pop(arguments.readCol)
            hits.extend(rowcells)

    # check last read!
    logging.info("Checking hits for %s" % currentRead)
    # was it part of a cluster?
    multiplier = 1
    if uncluster:
        multiplier = clusteredReadCounts[currentRead]
    # where does the count for this read go
    countHitsForRead(hits, counts, multiplier=multiplier)

    # apply cutoff
    if arguments.cutoff > 0:
        applyFractionalCutoff(counts, threshold=arguments.cutoff * readCount)

    # print output
    if arguments.outfile is None:
        outhandle = sys.stdout
    else:
        outhandle = open(arguments.outfile, 'w')

    if arguments.total:
        outhandle.write("Total%s%d\n" % (arguments.delimOut, readCount))

    if arguments.allMethod == 'portion':
        outFmtString = "%s%s%f\n"
    else:
        outFmtString = "%s%s%d\n"

    delimRE = re.compile(arguments.delimOut)
    for hit in sorted(counts.keys()):
        count = counts[hit]
        hit = delimRE.sub('_', hit)
        outhandle.write(outFmtString % (hit, arguments.delimOut, count))
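
The three allMethod behaviors logged above (first / all / portion) come down to how one read's count is distributed over its hits. A minimal sketch of the three dispatch targets, assuming getAllMethod simply returns one of these (the real implementations live elsewhere in the library):

def count_first(hits, counts, multiplier=1):
    # 'first': the whole count goes to the first hit
    counts[hits[0]] = counts.get(hits[0], 0) + multiplier

def count_all(hits, counts, multiplier=1):
    # 'all': every hit gets the full count
    for hit in hits:
        counts[hit] = counts.get(hit, 0) + multiplier

def count_portion(hits, counts, multiplier=1):
    # 'portion': the count is divided evenly among the hits
    share = multiplier / float(len(hits))
    for hit in hits:
        counts[hit] = counts.get(hit, 0) + share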