コード例 #1
0
def main():
    usage = "usage: %prog -O ORTHOLOGY [OPTIONS] BLAST_M8_FILES"
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for these reads or just read names (-r). The -F filter limits which hits are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser,defaults={'mapFile':None,'parseStyle':ACCS,'filterPct':-1,'countMethod':'all','taxdir':None})
    parser.add_option("-g", "--targetTaxonGroup", dest="group1", default=None, metavar="TAXON", action='append',
                      help="Taxon to identify reads in. Top hits (as defined by --topHitPct) must be in this group. It can be a taxid, a name, or a file listing taxids. Use multiple times to specify a list of organisms. Use -a to specify whether all or at least one of the top hits must match.")
    parser.add_option("-a","--any", default=False, action="store_true", help="If specified, accept reads where any top hit is to an organism in the target taxon/taxa. By default, all top hits must be in the target group.")
    addUniversalOptions(parser)
    parser.add_option('-t','--topHitPct', default=0, type='float',
                      help='How close (as a %) to the best score a hit must be to qualify as a top hit. Default is 0, ie must have the best score. Use 100 to get all hits.')
    parser.add_option("-G", "--outerTaxonGroup", dest="group2", default=None, metavar="TAXON", action="append",
                      help="Broader taxon to limit reads. All hits (use -F to limit these hits) must be in the target group or this group. Again, it can be a taxid, a name, or a file listing taxids. It can also be inkoved multiple times to choose multiple groups.")
    parser.add_option('-r','--reads', default=False, action="store_true",
                      help="Output just read names. By default, print the relevant hit lines for each read")

    (options, args) = parser.parse_args()

    if options.about:
        print description
        exit(0)

    # check args
    setupLogging(options,description)
    if options.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if options.taxdir is not None:
        taxonomy = readTaxonomy(options.taxdir, namesMap=True)
    else:
        taxonomy = None

    group1Map=getGroupMap(options.group1,taxonomy)
    group2Map=getGroupMap(options.group2,taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group1Map),group1Map.get(439482,False)))
    if group2Map is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group2Map),group2Map.get(439482,False)))

    # map reads to hits
    if options.parseStyle==GIS:
        keyType=int
    else:
        keyType=None
    accToTaxMap = parseMapFile(options.mapFile,valueType=int,keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE=parsingREs.get(options.parseStyle,None)
    if options.parseStyle == ORGS:
        getTaxid=_getOrgTaxid
    elif options.parseStyle == HITID:
        getTaxid=_getHitidTaxid
    elif options.parseStyle == HITDESC:
        getTaxid=_getHitdescTaxid
    else:
        getTaxid=_getExprTaxid

    # for filtering:
    filterParams = FilterParams.createFromOptions(options)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle,outhandle) in inputIterator(args,options):
        readCount=0
        goodReadCount=0
        printCount=0

        # parse file
        for (read,hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount+=1
            bestScore=0
            hitTaxids={}
            for hit in hits:
                score=hit.score
                taxids=[]
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit,accToTaxMap,taxonomy):
                    if taxid is None:
                        break
                    if group2Map is not None and not group2Map.get(taxid,False):
                        break
                    taxids.append(taxid)
                if len(taxids)==0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit]=taxids

                # find the top score
                if score>bestScore:
                    bestScore=score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read,bestScore))
                all=True
                recognized=[]
                for hit,taxids in _getBestHitTaxids(hitTaxids,bestScore,options.topHitPct):
                    if _anyTaxidInGroup(taxids,group1Map):
                        logging.debug("%s (%r)  is in group 1" % (hit,taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit,taxids))
                        all=False
                if len(recognized)==0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not options.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount+=1
                if options.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized),read))
                    for hit in sorted(recognized,key=lambda h: (h.score,h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount+=1

        if options.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount,readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount,goodReadCount, readCount))
コード例 #2
0
def main():
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser, defaults={"mapFile": None, "parseStyle": ACCS, "filterPct": -1, "countMethod": "all", "taxdir": None}
    )
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action="append",
        help="Taxon to identify reads in. Top hits (as defined by "
        "--topHitPct) must be in this group. It can be a taxid, "
        "a name, or a file listing taxids. Use multiple times to "
        "specify a list of organisms. Use -a to specify whether "
        "all or at least one of the top hits must match.",
    )
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
        "in the target taxon/taxa. By default, all top hits must be "
        "in the target group.",
    )
    parser.add_argument(
        "-t",
        "--topHitPct",
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
        "to qualify as a top hit. Default is 0, ie must have the best "
        "score. Use 100 to get all hits.",
    )
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
        "these hits) must be in the target group or this group. Again, "
        "it can be a taxid, a name, or a file listing taxids. "
        "It can also be inkoved multiple times to choose multiple "
        "groups.",
    )
    parser.add_argument(
        "-r",
        "--reads",
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit " "lines for each read",
    )

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(arguments.mapFile, valueType=int, keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (read, hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read, bestScore))
                all = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit, taxids))
                        all = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not arguments.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write("\n")
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized), read))
                    for hit in sorted(recognized, key=lambda h: (h.score, h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount, goodReadCount, readCount))
コード例 #3
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description)
    add_IO_arguments(parser)
    parser.add_argument("-l", "--level", dest="levels", default=None,
                        metavar="LEVEL", action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-s',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
             "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType is 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            if arguments.heirarchyType is 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(
                arguments.mapFile,
                valueType=None,
                keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(l, arguments) for l in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s" %
            (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            arguments.hitTableFormat,
            arguments.filterTopPct,
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write(
                        "%s\t%s\n" %
                        (read, "\t".join(assignmentList)))
コード例 #4
0
def main():
    # set up CLI
    description = """
    Takes a tabular text file and translate a column of KO values into a new
    column of KEGG pathways. KO column can have multiple entries per row.
    Output column will have multiple entries per pathway cell.
    """

    parser = argparse.ArgumentParser(description=description)
    util.add_IO_arguments(parser)
    parser.add_argument("-l",
                        "--level",
                        dest="level",
                        default="PATHWAY",
                        metavar="LEVEL",
                        help=""" Level to collect counts on. Level can
                        be one of:
                          NAME, PATHWAY, EC, DEFINITION,
                          or a level in the CLASS heirachy: 1, 2, or 3
                        """)
    parser.add_argument(
        "-f",
        "--fill_missing",
        dest="fill",
        metavar="FILL",
        default="No Pathway",
        help="Put FILL in column when KO has no pathway assigned. "
        "Defaults to 'No Pathway'")
    # format, ortholog heirarchy, and more
    parser.add_argument("-k",
                        "--ko_file",
                        dest="ko_file",
                        metavar="MAPFILE",
                        help="Location of kegg ko file")
    parser.add_argument("-c",
                        "--ko_column",
                        type=int,
                        default=1,
                        help="Column number (first column is 1)",
                        metavar="COLUMN")
    parser.add_argument(
        "-C",
        "--new_column",
        type=int,
        default=None,
        help="Column number to insert new column after. Default is the "
        "after the source column. 0=>make it the first column. "
        "-1=>make it the last column.",
        metavar="COLUMN")
    parser.add_argument(
        "-L",
        "--long_output",
        default=False,
        action='store_true',
        help="Insert new duplicate rows if KO maps to multiple values")
    parser.add_argument(
        "-H",
        "--header",
        default=None,
        metavar='HEADER',
        help="Put HEADER in first row instead of trying to translate")
    parser.add_argument("-Q",
                        "--quotes",
                        default=False,
                        action="store_true",
                        help="Encase translated values in double quotes")
    parser.add_argument(
        "-s",
        "--sep",
        dest='sep',
        default='\t',
        help="""Character separating table cells. Default is tab""")
    parser.add_argument(
        "-S",
        "--ko_sep",
        dest='ko_sep',
        default=';',
        help="""Character separating multiple KO values in iput table
            and used to separate multiple values in output column.
            Default is ";". Ignored for output if --longOutput
            requested""")

    # log level and help
    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    logging.info("KO mapping from: " + arguments.ko_file)
    logging.debug("Fill: '%s'" % (arguments.fill))

    translation = kegg.parse_KEGG_file(arguments.ko_file, arguments.level)

    # switch to zero indexing
    if arguments.new_column:
        arguments.new_column -= 1
    arguments.ko_column -= 1

    for (inhandle, outhandle) in util.inputIterator(arguments):
        for new_line in translate_ko_column(
                inhandle,
                sep=arguments.sep,
                ko_sep=arguments.ko_sep,
                ko_column=arguments.ko_column,
                new_column=arguments.new_column,
                translation=translation,
                default=arguments.fill,
                quotes=arguments.quotes,
                header=arguments.header,
                long_out=arguments.long_output,
        ):
            outhandle.write(new_line)
コード例 #5
0
def main():
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes a single m8 blast file and generates a table (or tables) of pathway/gene family assignments for the query sequences (aka 'reads'). Assignments can be for gene families, gene classes, or pathways. Multiple pathway or classification levels can be given. If they are, an assignment will be made at each level.
    This differs from assignPathsToReadsFromBlast.py in that: (1) it can handle CAZy and SEED, (2) it will output multiple levels in one file, (3) multiple assignments are always printed on multiple lines.
    This script will work with KEGG, SEED, or CAZy. CAZy only has one level of heirarchy, the others have 3. The CAZy heirarchy is apparent from the hit name and needs no supporting files. KEGG and SEED require mapping files to identify gene families and heirachy files to report levels other than the gene family or ortholog level. Both SEED and KEGG have three levels of classifications that can be indicated with a 1, 2, or 3. The words "subsystem" and "pathway" are synonyms for level 3.
    If a count method is selected that can produce multiple assignments per read, each assignment will be printed on a new line. 
    NOTE: in KEGG (and SEED) a single ortholog (role) may belong to multiple pathways (subsystems). A hit to such an ortholog will result in extra assignment values for that query sequence (1 for each pathway it belongs to). 
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    parser.add_option("-l", "--level", dest="levels", default=None,
                      metavar="LEVEL", action="append",
                      help=""" Level(s) to collect counts on. Use flag 
                      multiple times to specify multiple levels. If multiple 
                      values given, one table produced for each with rank 
                      name appended to file name. Levels can be an integer 
                      (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 
                      'ko', or 'ortholog' (which are all synonyms), or  
                      anything not synonymous with 'gene' to 
                      get CAZy groups. Defaults to ortholog/role and 
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_option('-s','--squash',dest='splitForLevels',
            default=True, action='store_false',
            help="Don't split assignment rows if gene maps to multiple pathways, just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.addPathOptions(parser)

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # Set defaults and check for some conflicts
    if options.levels is None and options.heirarchyFile is None:
        # using hit names only
        options.levels=[None]
    else:
        if options.heirarchyFile is None and options.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (options.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if options.levels is None:
            # set a default
            if options.heirarchyType is 'kegg':
                options.levels=['ko','1','2','pathway']
            if options.heirarchyType is 'seed':
                options.levels=['role','1','2','subsystem']
            else:
                options.levels=['gene','group']

        try:
            # Make sure the level list makes sense
            options.levels=cleanLevels(options.levels)
        except Exception as e:
            parser.error(str(e))

    # only print to stdout if there is a single input file
    if len(args)>1 and options.outfile is None:
        parser.error("STDOUT only works if a single input file is given!")


    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle == 'auto':
            with open(options.mapFile) as f:
                firstLine=f.next()
                while len(firstLine)==0 or firstLine[0]=='#':
                    firstLine=f.next()
            if koMapRE.search(firstLine):
                options.mapStyle='kegg'
            elif seedMapRE.search(firstLine):
                options.mapStyle='seed'
            elif tabMapRE.search(firstLine):
                options.mapStyle='tab'
            #elif cogMapRE.search(firstLine):
            #    options.mapStyle='cog'
            else:
                raise Exception("Cannot figure out map type from first line:\n%s" % (firstLine))

        logging.info("Map file seems to be: %s" % (options.mapStyle))
        if options.mapStyle=='kegg':
            valueMap=kegg.parseLinkFile(options.mapFile)
        elif options.mapStyle=='seed':
            valueMap=kegg.parseSeedMap(options.mapFile)
        #elif options.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(options.mapFile)
        else:
            if options.parseStyle == hits.GIS:
                keyType=int
            else:
                keyType=None
            valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType)
        if len(valueMap)>0:
            logging.info("Read %d items into map. EG: %s" % (len(valueMap),valueMap.iteritems().next()))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap=None

    # set up level mapping
    levelMappers = [getLevelMapper(l,options) for l in options.levels]

    # parse input files
    for (inhandle,outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.mappedHitsOnly,sortReads=options.hitTableSortReads)

        outhandle.write("Read\t%s\n" % ('\t'.join(options.levels)))
        for read, hitIter in hitMapIter:
            assignments=[]
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment=[]
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(assignment,options):
                    outhandle.write("%s\t%s\n" % (read, "\t".join(assignmentList)))
コード例 #6
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(parser,
                        defaults={
                            'filter_top_pct': 0,
                            'parseStyle': ACCS,
                            'countMethod': 'tophit'
                        },
                        choices={'countMethod': ('tophit', 'toporg')})
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
        "that the overal proportion of taxa is consistent with the "
        "unambiguious hits. This is meant for use with the 'toporg' "
        "count method.")
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
        "counts from all files are pooled for making choices.")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    wta = not (arguments.proportional)

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug("Reading from %s and writing to %s" %
                          (inhandle, outhandle))

            m8stream = M8Stream(inhandle)
            if arguments.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            elif len(arguments.input_files) <= 1:
                outputMap[infile_name] = open(arguments.output_file, 'w')
            else:
                # use outfileName as suffix
                if arguments.cwd:
                    # strip path info first
                    (infilePath, infileFile) = os.path.split(infile_name)
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            outhandle.write(hit.line.split("/", 1)[1])

        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()
コード例 #7
0
def main():
    usage = "usage: %prog [OPTIONS] INPUT_FILE(S)"
    description = """
Takes an m8 blast and assigns each read to a pathway or gene family. Blast may be specified with -i or piped to STDIN.
    """
    parser = OptionParser(usage, description=description)
    parser.add_option("-i", "--inputfile", dest="infile",
                      metavar="INFILE", help="Read data table from INFILE"),
    addIOOptions(parser)
    parser.add_option('-O', "--outputStyle", default="cols",
                      choices=['cols','lines','python'],
                      help="How are multiple assignments displayed in output. By default ('cols'), multiple hits show up in multiple columns. The 'lines' option prints out a new line for each assignment. The 'python' option prints each assignment as a python string (in quotes) or a list of strings (in quotes, separted by commas, surrounded bya  pair of sqaure brackets).")
    parser.add_option("-m", "--mapFile", dest="mapFile",
                      metavar="MAPFILE", help="Location of file containing table of with db hit name as first column and geneIDs (Knumber) in second column.")
    parser.add_option("-M", "--mapStyle", default='auto', choices=['auto','kegg','tab'],
                      help="What type of mapping file are you using: simple tab separated list of IDs and kos, or the genes_ko.list file from KEGG (which adds ko: to the K numbers and can have multiple records for each gene id). By default, this script will inspect the file name and guess, but you can force either 'kegg' or 'tab' with this option.")
    parser.add_option("-p", "--parseStyle",
                      default=KEGG,
                      choices=[ACCS,GIS,KEGG,HITID,HITDESC],
                      help="What should be parsed from the hit table: accessions('accs'), 'gis', K numbers in description ('kegg'), the full hit name('hitid'), or the full hit description('hitdesc'). (defaults to '%default')")
    parser.add_option("-c", "--cutoff", dest="cutoff", type="float", default=0.01,
            help="Cutoff for showing paths or genes. If a fractional count for a path/gene is below this value, it will be labelled None.",
                  metavar="CUTOFF")

    # format and filterPct
    addHitTableOptions(parser)

    parser.add_option("-C", "--countMethod", dest="countMethod", default="all", choices=('first','most','all','consensus'),
                      help="How to deal with assignments from multiple hits. (first, most: can return multiple hits, all (default): return every hit, consensus: return None unless all the same)",
                    metavar="COUNTMETHOD")
    parser.add_option("-r","--filterForKO",action="store_true", dest="koHitsOnly", default=False, help="ignore hits with no KO assignment. This means reads with no hits to KO tagged sequences will not be in the output.")
    parser.add_option("-l","--level", dest="level", default="ko", choices=('ko','NAME','DEFINITION','EC','PATHWAY','1','2','3'), help="Either 'ko'; a string to look for in ko file ('PATHWAY','NAME', 'DEFINITION', or 'EC'); or level in kegg class heirarchy (1, 2, or 3 (should be same as PATHWAY))")
    parser.add_option("-k", "--koFile", dest="ko", metavar="KOFILE", default=None,
                      help="File containing kegg heirarchy (either ko or ko00001.keg)")
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.infile is None:
        infile = sys.stdin
    else:
        infile = open(options.infile)

    if options.parseStyle==KEGG:
        if options.mapFile is not None:
            logging.warn("Do you REALLY want to apply a mapping to KOs?")

    if options.level != 'ko':
        if options.ko is None:
            options.error("Please supply KEGG file if sepcifying a level other than 'ko' ")

        # read KEGG file
        koTranslation = readKEGGFile(options.ko, options.level)
    else:
        koTranslation = None

    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle=='kegg' or ( options.mapStyle=='auto' and len(options.mapFile)>=13 and options.mapFile[-13:]=='genes_ko.list'):
            valueMap=parseLinkFile(options.mapFile)
        else:
            if options.parseStyle == GIS:
                keyType=int
            else:
                keyType=None
            valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType)
    else:
        valueMap=None

    for (inhandle,outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMap = parseM8File(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.koHitsOnly,sortReads=options.hitTableSortReads)

        # manipulate mappings
        hitMap = applySimpleCutoff(hitMap, options.cutoff, koTranslation)

        log("maps complete for %d reads" % (len(hitMap)))

        # print out hit table
        outhandle.write("Read\tHit\n")
        if options.outputStyle=='python':
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                outhandle.write(str(read))
                outhandle.write("\t")
                outhandle.write(repr(hit))
                outhandle.write("\n")
        if options.outputStyle=='lines':
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                if type(hit) is type([]):
                    for h in sorted(hit):
                        outhandle.write(str(read))
                        outhandle.write("\t")
                        outhandle.write(str(h))
                        outhandle.write("\n")
                else:
                    outhandle.write(str(read))
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                    outhandle.write("\n")
        else:
             for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                outhandle.write(str(read))
                if type(hit) is type([]):
                    for h in sorted(hit):
                        outhandle.write("\t")
                        outhandle.write(str(h))
                else:
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                outhandle.write("\n")
コード例 #8
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description)
    add_IO_arguments(parser)
    parser.add_argument("-l",
                        "--level",
                        dest="levels",
                        default=None,
                        metavar="LEVEL",
                        action="append",
                        help=""" Level(s) to collect counts on. Use flag
                      multiple times to specify multiple levels. If multiple
                      values given, one table produced for each with rank
                      name appended to file name. Levels can be an integer
                      (1-3) for KEGG or SEED levels, any one of 'gene',
                      'role', 'family',
                      'ko', or 'ortholog' (which are all synonyms), or
                      anything not synonymous with 'gene' to
                      get CAZy groups. Defaults to ortholog/role and
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_argument(
        '-S',
        '--squash',
        dest='splitForLevels',
        default=True,
        action='store_false',
        help="Don't split assignment rows if gene maps to multiple pathways, "
        "just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.add_path_arguments(parser)

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # Set defaults and check for some conflicts
    if arguments.levels is None and arguments.heirarchyFile is None:
        # using hit names only
        arguments.levels = [None]
    else:
        if arguments.heirarchyFile is None \
                and arguments.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (arguments.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if arguments.levels is None:
            # set a default
            if arguments.heirarchyType is 'kegg':
                arguments.levels = ['ko', '1', '2', 'pathway']
            if arguments.heirarchyType is 'seed':
                arguments.levels = ['role', '1', '2', 'subsystem']
            else:
                arguments.levels = ['gene', 'group']

        try:
            # Make sure the level list makes sense
            arguments.levels = cleanLevels(arguments.levels)
        except Exception as e:
            parser.error(str(e))

    # map reads to hits
    if arguments.mapFile is not None:
        if arguments.mapStyle == 'auto':
            with open(arguments.mapFile) as f:
                firstLine = next(f)
                while len(firstLine) == 0 or firstLine[0] == '#':
                    firstLine = next(f)
            if koMapRE.search(firstLine):
                arguments.mapStyle = 'kegg'
            elif seedMapRE.search(firstLine):
                arguments.mapStyle = 'seed'
            elif tabMapRE.search(firstLine):
                arguments.mapStyle = 'tab'
            elif cogMapRE.search(firstLine):
                arguments.mapStyle = 'cog'
            else:
                raise Exception(
                    "Cannot figure out map type from first line:\n%s" %
                    (firstLine))

        logging.info("Map file seems to be: %s" % (arguments.mapStyle))
        if arguments.mapStyle == 'kegg':
            valueMap = kegg.parseLinkFile(arguments.mapFile)
        elif arguments.mapStyle == 'seed':
            valueMap = kegg.parseSeedMap(arguments.mapFile)
        elif arguments.mapStyle == 'cog':
            valueMap = kegg.parseCogMap(arguments.mapFile)
        else:
            if arguments.parseStyle == hits.GIS:
                keyType = int
            else:
                keyType = None
            valueMap = parseMapFile(arguments.mapFile,
                                    valueType=None,
                                    valueDelim=arguments.tab_map_delim,
                                    keyType=keyType)
        if len(valueMap) > 0:
            logging.info("Read %d items into map. EG: %s" %
                         (len(valueMap), next(iter(valueMap.items()))))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap = None

    # set up level mapping
    levelMappers = [getLevelMapper(lvl, arguments) for lvl in arguments.levels]

    # parse input files
    for (inhandle, outhandle) in inputIterator(arguments):
        logging.debug("Reading from %s and writing to %s" %
                      (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(
            inhandle,
            valueMap,
            hits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            ignoreEmptyHits=arguments.mappedHitsOnly)

        if arguments.levels == [None]:
            arguments.levels = ['Hit']
        outhandle.write("Read\t%s\n" % ('\t'.join(arguments.levels)))
        for read, hitIter in hitMapIter:
            assignments = []
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment = []
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(
                        assignment, arguments):
                    outhandle.write("%s\t%s\n" %
                                    (read, "\t".join(assignmentList)))
コード例 #9
0
def main():
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for
    these reads or just read names (-r). The -F filter limits which hits
    are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'mapFile': None,
            'parseStyle': ACCS,
            'filter_top_pct': -1,
            'countMethod': 'all',
            'taxdir': None})
    parser.add_argument(
        "-g",
        "--targetTaxonGroup",
        dest="group1",
        default=None,
        metavar="TAXON",
        action='append',
        help="Taxon to identify reads in. Top hits (as defined by "
             "--topHitPct) must be in this group. It can be a taxid, "
             "a name, or a file listing taxids. Use multiple times to "
             "specify a list of organisms. Use -a to specify whether "
             "all or at least one of the top hits must match.")
    parser.add_argument(
        "-a",
        "--any",
        default=False,
        action="store_true",
        help="If specified, accept reads where any top hit is to an organism "
             "in the target taxon/taxa. By default, all top hits must be "
             "in the target group.")
    parser.add_argument(
        '-t',
        '--topHitPct',
        default=0,
        type=float,
        help="How close(as a percentage to the best score a hit must be "
             "to qualify as a top hit. Default is 0, ie must have the best "
             "score. Use 100 to get all hits.")
    parser.add_argument(
        "-G",
        "--outerTaxonGroup",
        dest="group2",
        default=None,
        metavar="TAXON",
        action="append",
        help="Broader taxon to limit reads. All hits (use -F to limit "
             "these hits) must be in the target group or this group. Again, "
             "it can be a taxid, a name, or a file listing taxids. "
             "It can also be inkoved multiple times to choose multiple "
             "groups.")
    parser.add_argument(
        '-r',
        '--reads',
        default=False,
        action="store_true",
        help="Output just read names. By default, print the relevant hit "
             "lines for each read")

    # log level and help
    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # check args
    if arguments.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if arguments.taxdir is not None:
        taxonomy = readTaxonomy(arguments.taxdir, namesMap=True)
    else:
        taxonomy = None

    group_1_set = get_group_set(arguments.group1, taxonomy)
    group_2_set = get_group_set(arguments.group2, taxonomy)
    logging.debug(
        "Group 1 has %d entries and 439482 in group1 is %s" %
        (len(group_1_set), 439482 in group_1_set))
    if group_2_set is not None:
        logging.debug(
            "Group 2 has %d entries and 439482 in group2 is %s" %
            (len(group_2_set), 439482 in group_2_set))

    # map reads to hits
    if arguments.parseStyle == GIS:
        keyType = int
    else:
        keyType = None
    accToTaxMap = parseMapFile(
        arguments.mapFile,
        valueType=int,
        keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE = parsingREs.get(arguments.parseStyle, None)
    if arguments.parseStyle == ORGS:
        getTaxid = _getOrgTaxid
    elif arguments.parseStyle == HITID:
        getTaxid = _getHitidTaxid
    elif arguments.parseStyle == HITDESC:
        getTaxid = _getHitdescTaxid
    else:
        getTaxid = _getExprTaxid

    # for filtering:
    filterParams = FilterParams.create_from_arguments(arguments)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle, outhandle) in inputIterator(arguments):
        readCount = 0
        goodReadCount = 0
        printCount = 0

        # parse file
        for (
                read,
                hits) in filterM8Stream(
                inhandle,
                filterParams,
                return_lines=False):
            readCount += 1
            bestScore = 0
            hitTaxids = {}
            for hit in hits:
                score = hit.score
                taxids = []
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit, accToTaxMap, taxonomy):
                    if taxid is None:
                        break
                    if group_2_set is not None and taxid not in group_2_set:
                        break
                    taxids.append(taxid)
                if len(taxids) == 0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit] = taxids

                # find the top score
                if score > bestScore:
                    bestScore = score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug(
                    "Checking best hits for %s (top score: %.1f)" %
                    (read, bestScore))
                all = True
                recognized = []
                for hit, taxids in _getBestHitTaxids(
                        hitTaxids, bestScore, arguments.topHitPct):
                    if _anyTaxidInGroup(taxids, group_1_set):
                        logging.debug("%s (%r)  is in group 1" % (hit, taxids))

                        recognized.append(hit)
                    else:
                        logging.debug(
                            "%s (%r) is not in group 1" %
                            (hit, taxids))
                        all = False
                if len(recognized) == 0:
                    # if none of the best are in our target list, next read
                    logging.debug(
                        "No best hits for %s are in group 1" %
                        (read))
                    continue
                if (not arguments.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug(
                        "Not all best hits for %s are in group 1" %
                        (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount += 1
                if arguments.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug(
                        "Keeping %d hits for %s" %
                        (len(recognized), read))
                    for hit in sorted(
                        recognized,
                        key=lambda h: (
                            h.score,
                            h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount += 1

        if arguments.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount, readCount))
        else:
            logging.info(
                "Printed %d lines for %d of %d reads" %
                (printCount, goodReadCount, readCount))
コード例 #10
0
def main():
    """ The CLI """
    description = """
Takes a hit table (reads searched against a database) and assigns
each read to a taxon. Hit table may be specified with -i or piped to STDIN.

    Notes:

     * Specifying a top score precent (-F) will force hits to be sorted
       by score within each read. However, it is assumed that the hits in
       the input table(s) are already grouped by read. This program does
       not attempt to sort the entire input.
    """
    parser = argparse.ArgumentParser(description)
    util.add_IO_arguments(parser)
    parser.add_argument("-T", "--taxids", default=False, action="store_true",
                        help="Output taxids instead of names")
    edlhits.add_taxon_arguments(parser)
    parser.add_argument(
        "-r",
        "--rank",
        dest="rank",
        default=None,
        metavar="RANK",
        help=" Rank to collect counts on.  Defaults to None (whatever "
             "the annotation was). Corresponds to rank names in nodes.dmp. "
             "To see list run: 'cut -f5 nodes.dmp | uniq | sort | uniq' in "
             "ncbi tax dir")
    parser.add_argument(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. "
             "Will be ignored if beyond the rank of the taxa "
             "(IE We can't include species if the taxon being counted "
             "is genus)")
    parser.add_argument(
        "--no-header",
        dest="no_header",
        default=False,
        action='store_true',
        help="do not write header line")

    util.add_universal_arguments(parser)
    arguments = parser.parse_args()
    util.setup_logging(arguments)

    logging.debug("Parsing style is: %s", arguments.parseStyle)

    # Handle the case where Galaxy tries to set None as a string
    arguments.printRanks = util.checkNoneOption(arguments.printRanks)

    # check arguments
    if arguments.taxids and arguments.taxdir is None:
        parser.error("Only use -T when a taxonomy is specified")
    if arguments.rank is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny(-n) if specifying a rank(-r).")
    if arguments.printRanks is not None and arguments.taxdir is None:
        parser.error(
            "Please supply NCBI phylogeny(-n) if specifying a rank(-R).")
    if arguments.rank is not None:
        if arguments.rank == 'domain':
            logging.warning('translating domain to superkingdom')
            arguments.rank = 'superkingdom'
        if arguments.rank not in ranks:
            parser.error("Unknown rank: %s" % (arguments.rank))

    try:
        # Make sure the rank lists make sense
        if arguments.printRanks is not None:
            arguments.printRanks = cleanRanks(arguments.printRanks)
    except Exception as exc:
        parser.error(str(exc))

    # load necessary maps
    (taxonomy, value_map) = edlhits.readMaps(arguments)

    # loop over inputs
    for (inhandle, outhandle) in util.inputIterator(arguments):
        logging.debug(
            "Reading from %s and writing to %s",
            inhandle, outhandle)
        hit_iter = edlhits.parseM8FileIter(
            inhandle,
            value_map,
            edlhits.FilterParams.create_from_arguments(arguments),
            arguments.parseStyle,
            arguments.countMethod,
            taxonomy=taxonomy,
            rank=arguments.rank)

        ##
        # print output
        # choose output method
        if arguments.taxids:
            hit_header = 'taxid'
            printer = taxid_printer
        else:
            if arguments.printRanks is None:
                hit_header = 'Hit(s)'
                printer = default_printer
            else:
                hit_header = '\t'.join(arguments.printRanks)

                def printer(read, hits):
                    " Inline function to reduce number of arguments "
                    return tax_table_printer(read,
                                             hits,
                                             arguments.rank,
                                             arguments.printRanks)

        # loop over reads
        if not arguments.no_header:
            outhandle.write("Read\t{}\n".format(hit_header))
        for (read, hits) in hit_iter:
            outhandle.write(printer(read, hits))
コード例 #11
0
def main():
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    add_IO_arguments(parser)
    add_taxon_arguments(
        parser,
        defaults={
            'filter_top_pct': 0,
            'parseStyle': ACCS,
            'countMethod': 'tophit'},
        choices={
            'countMethod': (
                'tophit',
                'toporg')})
    parser.add_argument(
        "-P",
        "--proportional",
        dest="proportional",
        default=False,
        action="store_true",
        help="Assign reads that have multiple equal top hits to taxa such "
             "that the overal proportion of taxa is consistent with the "
             "unambiguious hits. This is meant for use with the 'toporg' "
             "count method.")
    parser.add_argument(
        "-i",
        "--individualFiles",
        dest="individual",
        default=False,
        action="store_true",
        help="Use this flag to process files independently. Normally, "
             "counts from all files are pooled for making choices.")

    add_universal_arguments(parser)
    arguments = parser.parse_args()
    setup_logging(arguments)

    # load necessary maps
    params = FilterParams.create_from_arguments(arguments)
    if arguments.countMethod == 'toporg':
        (taxonomy, hitStringMap) = readMaps(arguments)

    wta = not(arguments.proportional)

    if len(arguments.input_files) <= 1 or arguments.individual:
        # loop over input
        for (inhandle, outhandle) in inputIterator(arguments):
            logging.debug(
                "Reading from %s and writing to %s" %
                (inhandle, outhandle))

            m8stream = M8Stream(inhandle)
            if arguments.countMethod == 'tophit':
                # don't give any taxonomy, just map to accessions for
                # redistribution
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    parseStyle=arguments.parseStyle)
            else:
                # translate to organism before finding most abundant
                readHits = redistribute.pickBestHitByAbundance(
                    m8stream,
                    filterParams=params,
                    returnLines=True,
                    winnerTakeAll=wta,
                    taxonomy=taxonomy,
                    hitStringMap=hitStringMap,
                    parseStyle=arguments.parseStyle)

            for line in readHits:
                outhandle.write(line)

    else:
        # process all files at once
        multifile = redistribute.multipleFileWrapper(arguments.input_files)

        # Build a map from input file name to output handle
        outputMap = {}
        for infile_handle in arguments.input_files:
            infile_name = infile_handle.name
            if arguments.output_file is None:
                outputMap[infile_name] = sys.stdout
            elif len(arguments.input_files) <= 1:
                outputMap[infile_name] = open(arguments.output_file, 'w')
            else:
                # use outfileName as suffix
                if arguments.cwd:
                    # strip path info first
                    (infilePath, infileFile) = os.path.split(infile_name)
                    outfile = "./" + infileFile + arguments.output_file
                else:
                    outfile = infile_name + arguments.output_file
                outputMap[infile_name] = open(outfile, 'w')

        if arguments.countMethod == 'tophit':
            # don't give any taxonomy, just map to accessions for
            # redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                parseStyle=arguments.parseStyle)
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=wta,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=arguments.parseStyle)

        for (read, hit) in readHits:
            infile_name, read = read.split("/", 1)
            outhandle = outputMap[unquote_plus(infile_name)]
            outhandle.write(hit.line.split("/", 1)[1])

        if arguments.output_file is not None:
            for outhandle in outputMap.values():
                outhandle.close()