Python parseM8FileIter Beispiele

Programmiersprache: Python

Namespace / Paketname: edl.hits

Methode / Funktion: parseM8FileIter

Beispiele auf hotexamples.com: 10

Python parseM8FileIter - 10 Beispiele gefunden. Dies sind die am besten bewerteten Python Beispiele für die edl.hits.parseM8FileIter, die aus Open Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Beispiel #1

Datei anzeigen

Datei: hittables.py Projekt: jmeppley/py-metagenomics

def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits

    Additional keyword arguments consumed here:
        namesMap (False): passed to readTaxonomy when taxonomy is a string
        valueType: passed to parseMapFile when hitStringMap is a string
        (defaults to int if a taxonomy is in use, otherwise None)
        allMethod (ALLEQ): passed to countIterHits

    infile may be a file name (str) or an already open handle. A handle
    opened here is always closed before returning, even on error; a handle
    supplied by the caller is left open.

    Returns the counts object produced by countIterHits.
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(
            taxonomy, namesMap=kwargs.pop('namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        # with a taxonomy, the mapped hit ids will need to be ints
        defaultValueType = int if taxonomy is not None else None
        valueType = kwargs.pop('valueType', defaultValueType)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    openedHere = isinstance(infile, str)
    inhandle = open(infile) if openedHere else infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  FilterParams(
                                      format=kwargs.pop('format', GENE),
                                      top_pct=kwargs.pop('filter_top_pct', 0),
                                  ),
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits (this consumes the iterator, so the handle must
        # stay open until it returns)
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)

        logger.info("Total hits: %s", total)
    finally:
        # only close handles we opened ourselves
        if openedHere:
            inhandle.close()

    return counts

Beispiel #2

Datei anzeigen

Datei: hittables.py Projekt: jmeppley/py-metagenomics

def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits

    Also reads these keyword arguments:
        namesMap (False): forwarded to readTaxonomy if taxonomy is a str
        valueType: forwarded to parseMapFile if hitStringMap is a str
        (default is int when a taxonomy is present, else None)
        allMethod (ALLEQ): forwarded to countIterHits

    If infile is a str it is opened here and closed again before this
    function exits (including when an exception is raised). Any other
    object is used as the input handle directly and is not closed.

    Returns the second element of the (total, counts) pair returned by
    countIterHits.
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        if taxonomy is not None:
            # the mapped hit ids will need to be ints
            valueType = kwargs.pop('valueType', int)
        else:
            valueType = kwargs.pop('valueType', None)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    infileIsName = isinstance(infile, str)
    if infileIsName:
        inhandle = open(infile)
    else:
        inhandle = infile

    try:
        # get iterator over reads that will parse hits
        filterParams = FilterParams(
            format=kwargs.pop('format', GENE),
            top_pct=kwargs.pop('filter_top_pct', 0),
        )
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  filterParams,
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits while the handle is still open
        allMethod = kwargs.pop('allMethod', ALLEQ)
        (total, counts) = countIterHits(hitIter,
                                        allMethod=allMethod,
                                        returnMap=False)
    finally:
        # release the file even if parsing or counting failed
        if infileIsName:
            inhandle.close()

    logger.info("Total hits: %s", total)

    return counts

Beispiel #3

Datei anzeigen

Datei: count_taxa.py Projekt: jmeppley/py-metagenomics

def main():
    """
    Set up the CLI, count hits per taxon, and print the count tables.

    Builds the argument parser (positional INFILE list, rank / collapse /
    proportional / print-rank options, plus the option groups registered by
    the add_*_arguments helpers), validates the option combination, counts
    hits for every input file and passes the results to
    printCountTablesByRank.

    Input files may be given as TAG=FILENAME; the tag is then used as the
    label for that file's counts. Without a tag the file name is the label.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="outfile",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-r", "--rank", dest="ranks", default=None,
                        metavar="RANK", action="append",
                        help=""" Rank(s) to collect counts on. Use flag
                        multiple
                        times to specify multiple ranks. If multiple values
                        given, one table produced for each with rank name
                        appended to file name. Defaults to all major ranks
                        between phylum and species. Corresponds to rank names
                        in nodes.dmp. To see list run:
                        'cut -f5 nodes.dmp | uniq | sort | uniq'
                        in ncbi tax dir. Will also accept 'organism' to mean
                        no rank (ie, just the organism name).""")
    parser.add_argument(
        "-s",
        "--collapseToDomain",
        default=False,
        action="store_true",
        help="Collapse all taxa below given rank down to "
             "superkingdom/domain. EG: in the genus output, anything "
             "assigned to Cyanobactia, will be lumped in with all "
             "other bacteria")
    parser.add_argument(
            "--proportional",
            dest="proportional",
            default=False,
            action="store_true",
            help="""When using tophit or toporg, redistribute proportionally
            instead of winner take all""")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, tax dir, and more
    add_taxon_arguments(
        parser,
        choices={
            'countMethod': (
                'LCA',
                'all',
                'first',
                'most',
                'tophit',
                'toporg',
                'consensus')})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if arguments.proportional and \
            arguments.countMethod not in ['tophit', 'toporg']:
        # message fixed to name the real flag (was "--proportinal")
        parser.error("--proportional only has meaning "
                     "if using tophit or toporg")

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    arguments.ranks = checkNoneOption(arguments.ranks)
    arguments.printRanks = checkNoneOption(arguments.printRanks)

    logging.info("Printing out ranks: %r", arguments.ranks)

    # Set defaults and check for some conflicts
    if arguments.ranks is None and arguments.taxdir is None:
        # using hit names only
        arguments.ranks = [ORG_RANK]
        if arguments.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if arguments.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if arguments.ranks is None:
            # set a default
            arguments.ranks = [
                'phylum',
                'class',
                'order',
                'family',
                'genus',
                'species']

        try:
            # Make sure the rank lists make sense
            arguments.ranks = cleanRanks(arguments.ranks)
            if arguments.printRanks is not None:
                arguments.printRanks = cleanRanks(arguments.printRanks)
        except Exception as e:
            # report any rank validation problem as a CLI usage error
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single rank
    if len(arguments.ranks) > 1 and arguments.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(arguments.ranks) > 1:
        rank = None
        if arguments.countMethod in ['consensus', 'most']:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "use a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'")
    else:
        rank = arguments.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(arguments)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod in ('tophit', 'toporg'):
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.keys())

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=not arguments.proportional,
                parseStyle=arguments.parseStyle,
                sequenceWeights=sequenceWeights)
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                             taxonomy=taxonomy,
                                             hitStringMap=hitStringMap)

            def translateHit(hit):
                # keep only the first translation returned for the hit
                return hitTranslator.translateHit(hit=hit)[0]

        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=not arguments.proportional,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=ACCS)

            # Organisms will be returned, make translator trivial:
            translateHit = passThrough

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file name>/<read name>"
            file_name, read_name = read_name.split("/", 1)
            file_tag = fileLabels[unquote_plus(file_name)]
            taxon = translateHit(hit)
            taxcount = fileCounts[file_tag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                # reads without an explicit weight count once
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][taxon] = taxcount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already gives universal newlines on Python 3;
            # the old 'rU' mode raises ValueError on Python >= 3.11.
            # The with-block also closes the file if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(infile,
                                          hitStringMap,
                                          params,
                                          arguments.parseStyle,
                                          arguments.countMethod,
                                          taxonomy=taxonomy,
                                          rank=rank)

                # countIterHits consumes the iterator, so it must run
                # while the file is still open
                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)

            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

    printCountTablesByRank(fileCounts, totals, sortedLabels, arguments)

Beispiel #4

Datei anzeigen

Datei: count_paths.py Projekt: jmeppley/py-metagenomics

def main():
    """
    Command line entry point: count hits per gene family / pathway level.

    Parses the command line, optionally loads a map from hit ids to gene
    families (the map style is auto-detected from the first non-comment
    line of the map file when mapStyle is 'auto'), counts hits for every
    input hit table and passes the results to printCountTablesByLevel.

    Input files may be given as TAG=FILENAME; the tag is then used as the
    label for that file's counts. Without a tag the file name is the label.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files", nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o", "--outfile", dest="output_file",
                        metavar="OUTFILE", help="Write count table to OUTFILE")
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={'countMethod':
                 ('tophit',
                  'first',
                  'most',
                  'all',
                  'consensus')},
        helps={'countMethod':
               ("How to deal with counts from multiple hits. ('first': "
                "just use the first hit, 'most': "
                "can return multiple hits, 'all': return every hit, "
                "consensus: return None unless all the same). Do not "
                "use most or consensus with more than one level at a time. "
                "Default is 'tophit': This breaks any ties by choosing "
                "the most abundant hit based on other unambiguous "
                "assignments.")})

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            # Compare by value (==), not identity, and chain with elif so
            # the KEGG defaults are not overwritten by the final else.
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            # report any level validation problem as a CLI usage error
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the map type from the first line that is not a comment
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            # generic tabular map; keys are ints when hits are parsed as GIs
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                valueDelim=arguments.tab_map_delim,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s",
                         len(valueMap), next(iter(valueMap.items())))
        else:
            # logging.warn is a deprecated alias (removed in Python 3.13)
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    params = FilterParams.create_from_arguments(arguments)
    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file tag>/<read name>"
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                # count untranslated hits under the literal label "None"
                gene = "None"
            logging.debug(
                "READ: %s\t%s\t%s\t%s",
                file_tag, read_name, hit.hit, gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                # reads without an explicit weight count once
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already gives universal newlines on Python 3;
            # the old 'rU' mode raises ValueError on Python >= 3.11.
            # The with-block also closes the file if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(
                    infile,
                    valueMap,
                    params,
                    arguments.parseStyle,
                    arguments.countMethod,
                    ignoreEmptyHits=arguments.mappedHitsOnly)

                # countIterHits consumes the iterator, so it must run
                # while the file is still open
                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)

            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s",
                total, len(counts), len(hitMap), filename)

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)

Beispiel #5

Datei anzeigen

def main():
    """
    Command line entry point: assign reads to gene families / pathways.

    Parses the command line, optionally loads a map from hit ids to gene
    families (the map style is auto-detected from the first non-comment
    line of the map file when mapStyle is 'auto'), then for every
    (input, output) handle pair from inputIterator writes a tab separated
    table: a header line "Read" plus one column per level, followed by one
    row per assignment of each read.
    """
    description = __doc__
    # description must be passed by keyword: the first positional
    # parameter of ArgumentParser is prog, not description
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-s',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            # Compare by value (==), not identity, and chain with elif so
            # the KEGG defaults are not overwritten by the final else.
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            # report any level validation problem as a CLI usage error
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the map type from the first line that is not a comment
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            # generic tabular map; keys are ints when hits are parsed as GIs
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s",
                         len(valueMap), next(iter(valueMap.items())))
        else:
            # logging.warn is a deprecated alias (removed in Python 3.13)
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping (built before levels may be renamed to 'Hit')
    levelMappers = [getLevelMapper(level, arguments)
                    for level in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s",
            inhandle, outhandle)
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            arguments.hitTableFormat,
            arguments.filterTopPct,
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        # with no levels requested, label the single output column 'Hit'
        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            # one assignment (a list with one entry per level) for each hit
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s", hit)
                assignments.append(
                    [levelMapper(hit) for levelMapper in levelMappers])
            logging.debug("Read %s has %d hits", read, len(assignments))
            for assignment in assignments:
                # handleMultipleMappings may yield several rows per assignment
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write(
                        "%s\t%s\n" %
                        (read, "\t".join(assignmentList)))

Beispiel #6

Datei anzeigen

Datei: assignPaths.py Projekt: JessAwBryant/py-metagenomics

def main():
    """
    Command line entry point: assign gene family / pathway levels to reads.

    Parses the command line with optparse, optionally loads a map from hit
    ids to gene families (KEGG link file, SEED map, or a generic map file,
    auto-detected from the first non-comment line when mapStyle is 'auto'),
    and then, for every (input, output) handle pair produced by
    inputIterator, writes a header line followed by one tab-separated row
    per assignment of each read.

    Exits through parser.error() on inconsistent options and raises
    Exception when the map file type cannot be detected.
    """
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes a single m8 blast file and generates a table (or tables) of pathway/gene family assignments for the query sequences (aka 'reads'). Assignments can be for gene families, gene classes, or pathways. Multiple pathway or classification levels can be given. If they are, an assignment will be made at each level.
    This differs from assignPathsToReadsFromBlast.py in that: (1) it can handle CAZy and SEED, (2) it will output multiple levels in one file, (3) multiple assignments are always printed on multiple lines.
    This script will work with KEGG, SEED, or CAZy. CAZy only has one level of heirarchy, the others have 3. The CAZy heirarchy is apparent from the hit name and needs no supporting files. KEGG and SEED require mapping files to identify gene families and heirachy files to report levels other than the gene family or ortholog level. Both SEED and KEGG have three levels of classifications that can be indicated with a 1, 2, or 3. The words "subsystem" and "pathway" are synonyms for level 3.
    If a count method is selected that can produce multiple assignments per read, each assignment will be printed on a new line. 
    NOTE: in KEGG (and SEED) a single ortholog (role) may belong to multiple pathways (subsystems). A hit to such an ortholog will result in extra assignment values for that query sequence (1 for each pathway it belongs to). 
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    parser.add_option("-l", "--level", dest="levels", default=None,
                      metavar="LEVEL", action="append",
                      help=""" Level(s) to collect counts on. Use flag 
                      multiple times to specify multiple levels. If multiple 
                      values given, one table produced for each with rank 
                      name appended to file name. Levels can be an integer 
                      (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 
                      'ko', or 'ortholog' (which are all synonyms), or  
                      anything not synonymous with 'gene' to 
                      get CAZy groups. Defaults to ortholog/role and 
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_option('-s', '--squash', dest='splitForLevels',
                      default=True, action='store_false',
                      help="Don't split assignment rows if gene maps to "
                           "multiple pathways, just squash them into one "
                           "row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.addPathOptions(parser)

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # Set defaults and check for some conflicts
    if options.levels is None and options.heirarchyFile is None:
        # using hit names only
        options.levels = [None]
    else:
        if options.heirarchyFile is None and options.heirarchyType != 'cazy':
            logging.warning("Type: %s" % (options.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if options.levels is None:
            # Set a default. Strings are compared by value (not identity) and
            # the branches are exclusive, so the KEGG default is no longer
            # overwritten by the fall-through default.
            if options.heirarchyType == 'kegg':
                options.levels = ['ko', '1', '2', 'pathway']
            elif options.heirarchyType == 'seed':
                options.levels = ['role', '1', '2', 'subsystem']
            else:
                options.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            options.levels = cleanLevels(options.levels)
        except Exception as e:
            parser.error(str(e))

    # only print to stdout if there is a single input file
    if len(args) > 1 and options.outfile is None:
        parser.error("STDOUT only works if a single input file is given!")

    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle == 'auto':
            # sniff the map type from the first non-comment line
            with open(options.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                options.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                options.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                options.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    options.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (options.mapStyle))
        if options.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(options.mapFile)
        elif options.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(options.mapFile)
        # elif options.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(options.mapFile)
        else:
            # GI-style hit ids are looked up as integers
            if options.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                options.mapFile,
                valueType=None,
                keyType=keyType)
        if len(valueMap) > 0:
            # next(iter(...)) works on both Python 2 and 3 dictionaries
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(l, options) for l in options.levels]

    # With no levels and no hierarchy file the only "level" is None (the raw
    # hit). Give that column a printable name so the header join cannot fail.
    if options.levels == [None]:
        headerLevels = ['Hit']
    else:
        headerLevels = options.levels

    # parse input files
    for (inhandle, outhandle) in inputIterator(args, options):
        logging.debug(
            "Reading from %s and writing to %s" %
            (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            options.hitTableFormat,
            options.filterTopPct,
            options.parseStyle,
            options.countMethod,
            ignoreEmptyHits=options.mappedHitsOnly,
            sortReads=options.hitTableSortReads)

        outhandle.write("Read\t%s\n" % ('\t'.join(headerLevels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                # one value per requested level for this hit
                assignments.append(
                    [levelMapper(hit) for levelMapper in levelMappers])
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, options):
                    outhandle.write(
                        "%s\t%s\n" %
                        (read, "\t".join(assignmentList)))

Beispiel #7

Datei anzeigen

def main():
    """
    Command line entry point: assign gene family / pathway levels to reads.

    Builds the argparse parser (using the module docstring as description),
    optionally loads a map from hit ids to gene families (KEGG link file,
    SEED map, COG map, or a generic delimited map; auto-detected from the
    first non-comment line when mapStyle is 'auto'), and then, for every
    (input, output) handle pair produced by inputIterator, writes a header
    line followed by one tab-separated row per assignment of each read.

    Exits through parser.error() on inconsistent arguments and raises
    Exception when the map file type cannot be detected.
    """
    description = __doc__
    # description must be passed by keyword: the first positional parameter
    # of ArgumentParser is 'prog', not 'description'.
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-S',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
        "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # Set a default. Strings are compared by value (not identity) and
            # the branches are exclusive, so the KEGG default is no longer
            # overwritten by the fall-through default.
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the map type from the first non-comment line
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            # GI-style hit ids are looked up as integers
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug("Reading from %s and writing to %s" %
                      (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            hits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        # the single "no level" column needs a printable name for the header
        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                # one value per requested level for this hit
                assignments.append(
                    [levelMapper(hit) for levelMapper in levelMappers])
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write("%s\t%s\n" %
                                    (read, "\t".join(assignmentList)))

Beispiel #8

Datei anzeigen

def main():
    """
    Command line entry point: count hits per gene family / pathway level.

    Builds the argparse parser (using the module docstring as description),
    optionally loads a map from hit ids to gene families, then counts hits
    for every input hit table and hands the per-file counts to
    printCountTablesByLevel.

    Input file names may be given as TAG=FILENAME; the tag is then used as
    the label for that file's counts.

    With countMethod 'tophit' all files are processed together through
    edl.redistribute.pickBestHitByAbundance; otherwise each file is parsed
    on its own with parseM8FileIter and countIterHits.

    Exits through parser.error() on inconsistent arguments and raises
    Exception when the map file type cannot be detected.
    """
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("input_files",
                        nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o",
                        "--outfile",
                        dest="output_file",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(
        parser,
        defaults={'countMethod': 'tophit'},
        choices={
            'countMethod': ('tophit', 'first', 'most', 'all', 'consensus')
        },
        helps={
            'countMethod':
            ("How to deal with counts from multiple hits. ('first': "
             "just use the first hit, 'most': "
             "can return multiple hits, 'all': return every hit, "
             "consensus: return None unless all the same). Do not "
             "use most or consensus with more than one level at a time. "
             "Default is 'tophit': This breaks any ties by choosing "
             "the most abundant hit based on other unambiguous "
             "assignments.")
        })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warning("Type: %s", arguments.heirarchyType)
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # Set a default. Strings are compared by value (not identity) and
            # the branches are exclusive, so the KEGG default is no longer
            # overwritten by the fall-through default.
            if arguments.heirarchyType == 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            elif arguments.heirarchyType == 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the rank lists make sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single level
    if len(arguments.levels) > 1 and arguments.output_file is None:
        parser.error("STDOUT only works if a single level is chosen!")

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            # sniff the map type from the first non-comment line
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            # elif cogMapRE.search(firstLine):
            #    arguments.mapStyle='cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s", arguments.mapStyle)
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        # elif arguments.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(arguments.mapFile)
        else:
            # GI-style hit ids are looked up as integers
            if arguments.parseStyle == GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s", len(valueMap),
                         next(iter(valueMap.items())))
        else:
            logging.warning("Read 0 items into value map!")
    else:
        valueMap = None

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    # TODO: incorporate weights into tophit algorithm!
    if arguments.countMethod == 'tophit':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.items())

        # don't give any hit translation, just use hit ids for redistribution
        readHits = redistribute.pickBestHitByAbundance(
            multifile,
            filterParams=params,
            returnLines=False,
            winnerTakeAll=True,
            parseStyle=arguments.parseStyle,
            sequenceWeights=sequenceWeights)
        # define method to turn Hits into Genes (kos, families)
        hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                         hitStringMap=valueMap)

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file tag>/<read name>"
            file_tag, read_name = read_name.split("/", 1)
            file_tag = unquote_plus(file_tag)
            gene = hitTranslator.translateHit(hit)[0]
            if gene is None:
                gene = "None"
            logging.debug("READ: %s\t%s\t%s\t%s", file_tag, read_name, hit.hit,
                          gene)
            genecount = fileCounts[file_tag].setdefault(gene, 0)
            if sequenceWeights is not None:
                # reads without an explicit weight count once
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][gene] = genecount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already applies universal newlines on Python 3;
            # the old 'rU' mode raises ValueError since Python 3.11. The
            # context manager closes the file even if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(
                    infile,
                    valueMap,
                    arguments.hitTableFormat,
                    arguments.filter_top_pct,
                    arguments.parseStyle,
                    arguments.countMethod,
                    ignoreEmptyHits=arguments.mappedHitsOnly)

                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info("parsed %d hits (%d unique) for %d reads from %s",
                         total, len(counts), len(hitMap), filename)

    logging.debug(repr(fileCounts))
    printCountTablesByLevel(fileCounts, totals, sortedLabels, arguments)

Beispiel #9

Datei anzeigen

Datei: count_taxa.py Projekt: Piplopp/py-metagenomics

def main():
    """
    Command line entry point: count hits per taxon at one or more ranks.

    Builds the argparse parser (using the module docstring as description),
    validates the rank arguments, loads the taxonomy and hit-id map through
    readMaps, counts hits for every input hit table and hands the per-file
    counts to printCountTablesByRank.

    Input file names may be given as TAG=FILENAME; the tag is then used as
    the label for that file's counts.

    With countMethod 'tophit' or 'toporg' all files are processed together
    through edl.redistribute.pickBestHitByAbundance; otherwise each file is
    parsed on its own with parseM8FileIter and countIterHits.

    Exits through parser.error() on inconsistent arguments.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("input_files",
                        nargs="+",
                        default=[],
                        metavar="INFILE",
                        help="List of hit tables to process")
    parser.add_argument("-o",
                        "--outfile",
                        dest="outfile",
                        metavar="OUTFILE",
                        help="Write count table to OUTFILE")
    parser.add_argument("-r",
                        "--rank",
                        dest="ranks",
                        default=None,
                        metavar="RANK",
                        action="append",
                        help=""" Rank(s) to collect counts on. Use flag
                        multiple
                        times to specify multiple ranks. If multiple values
                        given, one table produced for each with rank name
                        appended to file name. Defaults to all major ranks
                        between phylum and species. Corresponds to rank names
                        in nodes.dmp. To see list run:
                        'cut -f5 nodes.dmp | uniq | sort | uniq'
                        in ncbi tax dir. Will also accept 'organism' to mean
                        no rank (ie, just the organism name).""")
    parser.add_argument(
        "-s",
        "--collapseToDomain",
        default=False,
        action="store_true",
        help="Collapse all taxa below given rank down to "
        "superkingdom/domain. EG: in the genus output, anything "
        "assigned to Cyanobactia, will be lumped in with all "
        "other bacteria")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
        "Will be ignored if beyond the rank of the taxa "
        "(IE We can't include species if the taxon being counted "
        "is genus)")

    # option for deconvoluting clusters or assemblies
    add_weight_arguments(parser, multiple=True)

    # cutoff options
    add_count_arguments(parser)

    # format, tax dir, and more
    add_taxon_arguments(parser,
                        choices={
                            'countMethod': ('LCA', 'all', 'first', 'most',
                                            'tophit', 'toporg', 'consensus')
                        })

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    if len(arguments.input_files) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    arguments.ranks = checkNoneOption(arguments.ranks)
    arguments.printRanks = checkNoneOption(arguments.printRanks)

    logging.info("Printing out ranks: %r", arguments.ranks)

    # Set defaults and check for some conflicts
    if arguments.ranks is None and arguments.taxdir is None:
        # using hit names only
        arguments.ranks = [ORG_RANK]
        if arguments.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if arguments.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if arguments.ranks is None:
            # set a default
            arguments.ranks = [
                'phylum', 'class', 'order', 'family', 'genus', 'species'
            ]

        try:
            # Make sure the rank lists make sense
            arguments.ranks = cleanRanks(arguments.ranks)
            if arguments.printRanks is not None:
                arguments.printRanks = cleanRanks(arguments.printRanks)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(arguments.weights)

    # only print to stdout if there is a single rank
    if len(arguments.ranks) > 1 and arguments.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(arguments.ranks) > 1:
        rank = None
        if arguments.countMethod in ['consensus', 'most']:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' "
                "or 'most' counting methods. LCA should give the same "
                "results as consensus. If you really want to do this, "
                "use a bash loop:'for rank in phylum order genus; do "
                "COMMAND -r ${rank}; done'")
    else:
        rank = arguments.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(arguments)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
    for filename in arguments.input_files:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    if arguments.countMethod == 'tophit' or arguments.countMethod == 'toporg':
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute
        params = FilterParams.create_from_arguments(arguments)
        multifile = redistribute.multipleFileWrapper(fileLabels.keys())

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=True,
                parseStyle=arguments.parseStyle,
                sequenceWeights=sequenceWeights)
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(parseStyle=arguments.parseStyle,
                                             taxonomy=taxonomy,
                                             hitStringMap=hitStringMap)

            def translateHit(hit):
                """Return the first translation of a hit."""
                return hitTranslator.translateHit(hit=hit)[0]

        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=True,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=ACCS)

            # Organisms will be returned, make translator trivial:
            translateHit = passThrough

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read_name, hit) in readHits:
            # read names come back as "<quoted file name>/<read name>"
            file_name, read_name = read_name.split("/", 1)
            file_tag = fileLabels[unquote_plus(file_name)]
            taxon = translateHit(hit)
            taxcount = fileCounts[file_tag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                # reads without an explicit weight count once
                increment = sequenceWeights.get(read_name, 1)
            fileCounts[file_tag][taxon] = taxcount + increment
            totals[file_tag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.items():
            # Plain text mode already applies universal newlines on Python 3;
            # the old 'rU' mode raises ValueError since Python 3.11. The
            # context manager closes the file even if parsing fails.
            with open(filename) as infile:
                hitIter = parseM8FileIter(infile,
                                          hitStringMap,
                                          arguments.hitTableFormat,
                                          arguments.filter_top_pct,
                                          arguments.parseStyle,
                                          arguments.countMethod,
                                          taxonomy=taxonomy,
                                          rank=rank)

                (total, counts, hitMap) = \
                    countIterHits(hitIter,
                                  allMethod=arguments.allMethod,
                                  weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info("parsed %d hits (%d unique) for %d reads from %s",
                         total, len(counts), len(hitMap), filename)

    printCountTablesByRank(fileCounts, totals, sortedLabels, arguments)

Beispiel #10

Datei anzeigen

Datei: assign_taxa.py Projekt: jmeppley/py-metagenomics

def main():
    """
    Command line entry point: assign each read in a hit table to a taxon.

    Steps, in order:

    1. Build the argument parser (I/O, taxon and universal options are
       added by helper functions in ``util`` and ``edlhits``) and parse
       the command line.
    2. Validate option combinations; any problem is reported through
       ``parser.error``, which prints the usage message and exits.
    3. Load the taxonomy and hit-id map with ``edlhits.readMaps``.
    4. For every (input, output) handle pair yielded by
       ``util.inputIterator``, iterate over the (read, hits) pairs from
       ``edlhits.parseM8FileIter`` and write one formatted record per
       read, preceded by a header line unless ``--no-header`` was given.
    """
    description = """
Takes a hit table (reads searched against a database) and assigns
each read to a taxon. Hit table may be specified with -i or piped to STDIN.

    Notes:

     * Specifying a top score precent (-F) will force hits to be sorted
       by score within each read. However, it is assumed that the hits in
       the input table(s) are already grouped by read. This program does
       not attempt to sort the entire input.
    """
    # The first positional parameter of ArgumentParser is ``prog``, so the
    # description has to be passed by keyword or it ends up as the program
    # name in usage/error messages.
    parser = argparse.ArgumentParser(description=description)
    util.add_IO_arguments(parser)
    parser.add_argument("-T", "--taxids", default=False, action="store_true",
                        help="Output taxids instead of names")
    edlhits.add_taxon_arguments(parser)
    parser.add_argument(
        "-r",
        "--rank",
        dest="rank",
        default=None,
        metavar="RANK",
        help=" Rank to collect counts on.  Defaults to None (whatever "
             "the annotation was). Corresponds to rank names in nodes.dmp. "
             "To see list run: 'cut -f5 nodes.dmp | uniq | sort | uniq' in "
             "ncbi tax dir")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")
    parser.add_argument(
        "--no-header",
        dest="no_header",
        default=False,
        action='store_true',
        help="do not write header line")

    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    # NOTE(review): parseStyle, countMethod and taxdir are not declared in
    # this function; presumably edlhits.add_taxon_arguments adds them.
    logging.debug("Parsing style is: %s", arguments.parseStyle)

    # Handle the case where Galaxy tries to set None as a string
    arguments.printRanks = util.checkNoneOption(arguments.printRanks)

    # check arguments: taxids, rank and printRanks all need a taxonomy
    if arguments.taxids and arguments.taxdir is None:
        parser.error("Only use -T when a taxonomy is specified")
    if arguments.rank is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny(-n) if specifying a rank(-r).")
    if arguments.printRanks is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny(-n) if specifying a rank(-R).")
    if arguments.rank is not None:
        if arguments.rank == 'domain':
            logging.warning('translating domain to superkingdom')
            arguments.rank = 'superkingdom'
        if arguments.rank not in ranks:
            parser.error("Unknown rank: %s" % (arguments.rank))

    try:
        # Make sure the rank lists make sense
        if arguments.printRanks is not None:
            arguments.printRanks = cleanRanks(arguments.printRanks)
    except Exception as exc:
        # Any failure while cleaning the rank list is reported as a usage
        # error instead of a traceback.
        parser.error(str(exc))

    # load necessary maps
    (taxonomy, value_map) = edlhits.readMaps(arguments)

    # Choose the output method once: it depends only on the parsed
    # arguments, not on the individual input being processed.
    if arguments.taxids:
        hit_header = 'taxid'
        printer = taxid_printer
    elif arguments.printRanks is None:
        hit_header = 'Hit(s)'
        printer = default_printer
    else:
        hit_header = '\t'.join(arguments.printRanks)

        def printer(read, hits):
            " Inline function to reduce number of arguments "
            return tax_table_printer(read,
                                     hits,
                                     arguments.rank,
                                     arguments.printRanks)

    # loop over inputs
    for (inhandle, outhandle) in util.inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s",
            inhandle, outhandle)
        hit_iter = edlhits.parseM8FileIter(
            inhandle,
            value_map,
            edlhits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            taxonomy=taxonomy,
            rank=arguments.rank)

        # print output: optional header, then one record per read
        if not arguments.no_header:
            outhandle.write("Read\t{}\n".format(hit_header))
        for (read, hits) in hit_iter:
            outhandle.write(printer(read, hits))