def main():
    usage = "usage: %prog -O ORTHOLOGY [OPTIONS] BLAST_M8_FILES"
    description = """
    Given two lists of taxids and one or more hit tables, identify reads that:
     (1) have their best hits in taxid list 1
     (2) have all other hits in either list

    Finally, print out either the hits (that match the target group) for these reads or just read names (-r). The -F filter limits which hits are used in part (2) as well as which are printed.

    The countMethod (-C) option is not used.
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addTaxonOptions(parser,defaults={'mapFile':None,'parseStyle':ACCS,'filterPct':-1,'countMethod':'all','taxdir':None})
    parser.add_option("-g", "--targetTaxonGroup", dest="group1", default=None, metavar="TAXON", action='append',
                      help="Taxon to identify reads in. Top hits (as defined by --topHitPct) must be in this group. It can be a taxid, a name, or a file listing taxids. Use multiple times to specify a list of organisms. Use -a to specify whether all or at least one of the top hits must match.")
    parser.add_option("-a","--any", default=False, action="store_true", help="If specified, accept reads where any top hit is to an organism in the target taxon/taxa. By default, all top hits must be in the target group.")
    addUniversalOptions(parser)
    parser.add_option('-t','--topHitPct', default=0, type='float',
                      help='How close (as a %) to the best score a hit must be to qualify as a top hit. Default is 0, ie must have the best score. Use 100 to get all hits.')
    parser.add_option("-G", "--outerTaxonGroup", dest="group2", default=None, metavar="TAXON", action="append",
                      help="Broader taxon to limit reads. All hits (use -F to limit these hits) must be in the target group or this group. Again, it can be a taxid, a name, or a file listing taxids. It can also be inkoved multiple times to choose multiple groups.")
    parser.add_option('-r','--reads', default=False, action="store_true",
                      help="Output just read names. By default, print the relevant hit lines for each read")

    (options, args) = parser.parse_args()

    if options.about:
        print description
        exit(0)

    # check args
    setupLogging(options,description)
    if options.group1 is None:
        parser.error("Please use -g to specify a target taxonomic group")

    if options.taxdir is not None:
        taxonomy = readTaxonomy(options.taxdir, namesMap=True)
    else:
        taxonomy = None

    group1Map=getGroupMap(options.group1,taxonomy)
    group2Map=getGroupMap(options.group2,taxonomy)
    logging.debug("Group 1 has %d entries and 439482 in group1 is %s" % (len(group1Map),group1Map.get(439482,False)))
    if group2Map is not None:
        logging.debug("Group 2 has %d entries and 439482 in group2 is %s" % (len(group2Map),group2Map.get(439482,False)))

    # map reads to hits
    if options.parseStyle==GIS:
        keyType=int
    else:
        keyType=None
    accToTaxMap = parseMapFile(options.mapFile,valueType=int,keyType=keyType)

    # set up some function pointers
    global hitRE
    hitRE=parsingREs.get(options.parseStyle,None)
    if options.parseStyle == ORGS:
        getTaxid=_getOrgTaxid
    elif options.parseStyle == HITID:
        getTaxid=_getHitidTaxid
    elif options.parseStyle == HITDESC:
        getTaxid=_getHitdescTaxid
    else:
        getTaxid=_getExprTaxid

    # for filtering:
    filterParams = FilterParams.createFromOptions(options)
    logging.debug(repr(filterParams))

    # loop over hit tables
    for (inhandle,outhandle) in inputIterator(args,options):
        readCount=0
        goodReadCount=0
        printCount=0

        # parse file
        for (read,hits) in filterM8Stream(inhandle, filterParams, returnLines=False):
            readCount+=1
            bestScore=0
            hitTaxids={}
            for hit in hits:
                score=hit.score
                taxids=[]
                # does this hit have at least one associated taxid in group2?
                for taxid in getTaxid(hit,accToTaxMap,taxonomy):
                    if taxid is None:
                        break
                    if group2Map is not None and not group2Map.get(taxid,False):
                        break
                    taxids.append(taxid)
                if len(taxids)==0:
                    # nothing matched in the wider group
                    break
                hitTaxids[hit]=taxids

                # find the top score
                if score>bestScore:
                    bestScore=score
            else:
                # if we get here, then every hit was in wider taxon list
                logging.debug("Checking best hits for %s (top score: %.1f)" % (read,bestScore))
                all=True
                recognized=[]
                for hit,taxids in _getBestHitTaxids(hitTaxids,bestScore,options.topHitPct):
                    if _anyTaxidInGroup(taxids,group1Map):
                        logging.debug("%s (%r)  is in group 1" % (hit,taxids))

                        recognized.append(hit)
                    else:
                        logging.debug("%s (%r) is not in group 1" % (hit,taxids))
                        all=False
                if len(recognized)==0:
                    # if none of the best are in our target list, next read
                    logging.debug("No best hits for %s are in group 1" % (read))
                    continue
                if (not options.any) and (not all):
                    # next read unless user said any or all hits are in list
                    logging.debug("Not all best hits for %s are in group 1" % (read))
                    continue

                # if we get here, then the read is a match
                goodReadCount+=1
                if options.reads:
                    logging.debug("Keeping %s" % (read))
                    outhandle.write(read)
                    outhandle.write('\n')
                else:
                    logging.debug("Keeping %d hits for %s" % (len(recognized),read))
                    for hit in sorted(recognized,key=lambda h: (h.score,h.hit)):
                        outhandle.write(hit.getLine(filterParams))
                        printCount+=1

        if options.reads:
            logging.info("Printed %d of %d reads" % (goodReadCount,readCount))
        else:
            logging.info("Printed %d lines for %d of %d reads" % (printCount,goodReadCount, readCount))
def main():
    usage = "usage: %prog OPTIONS JSON_FILE(s)"
    description = __doc__
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addUniversalOptions(parser)

    parser.add_option(
        "-r", "--root", default=None, help="Plot a subset of the tree by choosing a root node for the subtree"
    )
    parser.add_option(
        "-c",
        "--colors",
        default=None,
        help="Set colors by mapping taxon names to color strings. Value should be a comma-separated list of id=color pairs (Bacteria=g,Archaea=r). The subtree of each mapped node will get the given color unless overridden by another entry.",
    )
    parser.add_option(
        "-C",
        "--cutoff",
        default=0.025,
        type="float",
        help="Trim nodes below this value. Interpreted as an absolute threshold if >1 and as fractional if <1. Set to 0 (or less) to turn off.",
    )

    parser.add_option(
        "-R", "--ranks", default="superkingdom,phylum,family", help="Ranks to inclued in sunburst, default: %default"
    )
    parser.add_option(
        "-n",
        "--ncbiTaxDir",
        dest="taxdir",
        metavar="PATH",
        default=None,
        help="Directory with unpacked ncbi tax dump (specifically names.dmp and nodes.dmp) and use to translate taxids into taxa. ",
    )
    parser.add_option(
        "-i",
        "--icicle",
        default=False,
        action="store_true",
        help="Print stacked bars in rectangular coordinates, not polar.",
    )
    parser.add_option(
        "-e",
        "--exterior_labels",
        default=False,
        action="store_true",
        help="Print labels for outermost nodes outside image",
    )
    parser.add_option(
        "-s",
        "--sortKey",
        default=[],
        action="append",
        choices=[NAME, COLOR, VALUE],
        help='how to sort nodes. Defaults to "color" and "name"',
    )

    parser.add_option(
        "-S", "--figsize", default=None, help="Comma separated pair of numbers (in inches) for figure size"
    )

    parser.add_option(
        "-f",
        "--format",
        dest="format",
        default="pdf",
        choices=["png", "ps", "pdf", "svg"],
        help="Format for output image",
        metavar="FORMAT",
    )
    parser.add_option(
        "-J", "--JSON", default=False, action="store_true", help="output JSON tree of counts instead of figure"
    )

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    if not options.JSON:
        # setup matplotlib
        backend = options.format
        if backend == "png":
            backend = "agg"
        matplotlib.use(backend)
        import matplotlib.pyplot as plt

    # load taxonomy
    if options.taxdir is None:
        parser.error("You must supply the location of the NCBI tax dump files")
    taxonomy = readTaxonomy(options.taxdir)

    # build rank list
    ranks = options.ranks.split(",")

    if options.JSON:
        # Standard iterator that returns handles
        inputIterator = inputIteratorNormal

        if len(options.sortKey) > 0:
            logger.warn("the SORT option has no effect on JSON output")
    else:
        # version that defaults to adding format as suffix and returns name
        inputIterator = inputIteratorFig

    # process user-selected options
    kwargs = processOptions(options)

    # process input files
    for (inhandle, outfile) in inputIterator(args, options):
        # load counts
        counts = {}
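        # each input line is expected to be "taxid,count" (e.g. "1224,52");
        # the literal taxid "None" is accepted for unassigned counts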
        for line in inhandle:
            (taxid, count) = line.rstrip("\n\r").split(",")
            if taxid == "None":
                tax = None
            else:
                tax = taxonomy.idMap.get(int(taxid), None)
            counts[tax] = counts.get(tax, 0) + int(count)

        # convert to JSON
        (nxtree, root) = convertToNx(counts, leaves=True, ranks=ranks)
        tree = convertToJSON(nxtree, root)

        # process JSON
        if options.colors is not None:
            setColors(tree, options.colors, **kwargs)
        if options.root is not None:
            newRoot = findNode(tree, options.root, **kwargs)
            if newRoot is not None:
                tree = newRoot

        total = applyCutoff(tree, options.cutoff, **kwargs)
        if options.JSON:
            putNodeCountsInOther(tree)
            outfile.write(json.dumps(tree, indent=2))
        else:
            # some of the matplotlib functions don't like extra arguments
            kwargs.pop(ID)

            # create figure
            plotSunburstJSON(tree, **kwargs)

            # save to file
            plt.savefig(outfile, format=options.format)
def main():
    usage = "usage: %prog OPTIONS"
    description = """
Generates a simple bar plot from a table of data. By default, the first two columns are taken as labels and data. Alternatively, the table can be transposed and a selected row/column plotted.
    """
    parser = OptionParser(usage, description=description)
    addUniversalOptions(parser)
    parser.add_option("-i", "--inputfile", dest="infile",
                      metavar="INFILE", help="Read data table from INFILE"),
    parser.add_option("-o", "--others", dest="others", default=False, action="store_true", help="include sum of values under cutoff")
    parser.add_option("-p", "--pct", dest="pct", default=False, action="store_true", help="plot percents (before cutoff) instead of counts")
    parser.add_option("-t", "--transpose",
                  action="store_true", dest="transpose", default=False,
                  help="Transpose data")
    parser.add_option("-c", "--cutoff", dest="cutoff", type="float", default=0.025,
		  help="data cutoff value (default .025)",
                  metavar="CUTOFF")
    parser.add_option("-d", "--dataColumn", dest="dataCol", default='0',
            help="Index (starting at 0) or name of column with data to be plotted",
                  metavar="HITCOL")
    parser.add_option("-f", "--format", dest="format", default='pdf', choices=['png','ps','pdf','svg'],
		  help="Format for output image", metavar="FORMAT")
    parser.add_option("-O","--outputFile", default=None, help="Manually set output file name")
    parser.add_option("-N","--outputNameOnly", default=False, action='store_true', help="Only print name of file to create. Don't do anything else")

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    if options.infile is None:
        parser.error("Please supply an input file name!")

    if options.outputFile is None:
        ofbits = [options.infile,'1dBarPlot']
        if options.transpose:
            ofbits.append('T')
        if options.pct:
            ofbits.append('P')
        if options.others:
            ofbits.append('O')
        ofbits.extend([options.dataCol,str(options.cutoff),options.format])
        outfile = '.'.join(ofbits)
    else:
        outfile=options.outputFile

    backend = options.format
    if backend=='png':
        backend='agg'
    matplotlib.use(backend)
    import matplotlib.pyplot as plt

    print outfile
    if options.outputNameOnly:
        sys.exit(0)

    log("Data in: %s\nImage out: %s" % (options.infile,outfile))
    log("Output format: %s" % options.format)

    series = makeSeriesFromFile(options.infile,options.dataCol, options.transpose, options.cutoff, options.others, options.pct)

    # create list of values over cutoff and sum of others
    labels = series.index

    # plot
    fig = plt.figure()
    ax =fig.add_axes([.1,.4,.8,.5])
    x = range(len(series))
    ax.bar(x,series)
    plt.xticks([i+.5 for i in x],labels,size=7,rotation=-90)

    plt.savefig(outfile,format=options.format)
def main():
    usage = "usage: %prog [OPTIONS] RUN_NAME [RUN_NAME ...]"
    description="""
Given a list of runs and an optional regular expression for sample names
run the samples through the Metagenomic pipeline in Galaxy
"""
    parser = OptionParser(usage, description=description)
    parser.add_option('-p','--pipelineVersion', default="",
                      help="VErsion of the 'MG Pipeline' to run")
    parser.add_option('-P', '--pipelineName', default=None,
                      help="Name of pipeline (wihout version string)")
    parser.add_option('-u', '--api_url', 
                      default="https://localhost/api",
                      help="URL of Galaxy API")
    parser.add_option('-k', '--api_key', default=None,
                      help="Galaxy API key for connecting. REQUIRED!")
    parser.add_option('-s', '--sample_regex', default=None,
                      help="Expression for matching sample names. If empty, all samples in given runs are processed")
    parser.add_option('-c', '--chemistry', dest='chem', default=None,
                      help="Override chemistry from SampleSheet with this value. One of 'truseq','scriptseq',or 'nextera'",
                      choices=['truseq','scriptseq','nextera'])

    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.sample_regex is None:
        sampleRE=None
    else:
        sampleRE=re.compile(options.sample_regex)

    if options.pipelineName is None:
        parser.error("Please supply a workflow name!")
    wfName=options.pipelineName + options.pipelineVersion
    hpref=re.sub(r'[^A-Z0-9]','',options.pipelineName)
    if options.pipelineVersion != "":
        hprep += + '.' + re.sub(r'[^A-Z0-9]','',options.pipelineVersion)

    logging.debug("SampleRE:\t%r\nworkflow:\t%s\nhistPref:\t%s" % (sampleRE, wfName, hpref))

    if options.api_key is None:
        key = edl.galaxy.getApiKey()
        if key is None:
            parser.error("You must speicfy an API key with the -k flag!")
        else:
            options.api_key = key

    total=0
    for runName in args:
        wfcount=0
        for r in edl.galaxy.launchWorkflowOnSamples(options.api_key, runName,
                                                    workflowName=wfName, 
                                                    sampleRE=sampleRE,
                                                    apiURL=options.api_url,
                                                    historyPrefix=hpref,
                                                    chemistry=options.chem):
            logging.debug(repr(r))
            if 'error' in r:
                logging.warn(r['error'])
            else:
                wfcount+=1

        logging.info("Launched workflow on %d samples in %s" % (wfcount,runName))
        total+=wfcount

    logging.info("Launched workflow %s on %d samples from %s runs" % (wfName, total, len(args)))
def main():
    usage = "usage: %prog [OPTIONS] EXPR [EXPR ...]"
    description="""
Given a list of regular expressions, pulls one dataset (chosen by index with -d or by name pattern with -D) from each matching history. If this is run on a machine that can access the galaxy files directory, it will create symlinks to the originals unless the "-c" option is given.
"""
    parser = OptionParser(usage, description=description)
    parser.add_option('-u', '--api_url', 
                      default="https://localhost/api",
                      help="URL of Galaxy API")
    parser.add_option('-k', '--api_key', default=None,
                      help="Galaxy API key for connecting. REQUIRED!")
    parser.add_option('-d', '--dataset_index', default=-1, type='int',
                      help="which dataset to get in each history")
    parser.add_option('-D', '--dataset_regex', default=None,
                      help="Expression for matching dataset names. If set, overrides the dataset_index (-d)")
    parser.add_option("-S","--sameDir", default=False, action='store_true',
            help="Save all datasets to same directory, don't created subdirs for each history")
    parser.add_option("-O",'--saveDir', default=".",
            help="Direcotry in which to save files. A subdirectory will be created for each matchin history. Defaults to the current directory")
    parser.add_option("-o", "--outfileName", default=None,
                     help="If set, give this name to output files, otherwise pull name from galaxy")
    parser.add_option("-C", "--chunk_size", default=1024, type='int', 
            help="Chunk size for file downloads. Bigger should speed up the download of large files, but slow down lots of small files. Defaults to 1024")
    parser.add_option("-c", "--force_copy", default=False, action='store_true',
            help="Copy data files even if symlinks are possible")

    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.api_key is None:
        key = edl.galaxy.getApiKey()
        if key is None:
            parser.error("You must speicfy an API key with the -k flag!")
        else:
            options.api_key = key

    # set up dataset search values
    if options.dataset_regex is None:
        if options.dataset_index == -1:
            parser.error('Please supply a dataset number or dataset string with -d or -D!')
        dsNum=options.dataset_index
        dsName=None
    else:
        dsNum=None
        dsName=re.compile(options.dataset_regex)

    logging.debug("Looking for dataset: %r/%r" % (options.dataset_regex, 
                                                  options.dataset_index))

    # Are we on the same machine as galaxy?
    filesAreLocal = re.search(r'://(localhost|127\.0\.0\.1)',options.api_url) is not None
    logging.debug("URL seems local, will attempt to link")

    for (history,dataset) in edl.galaxy.findDatasets(options.api_key, args,
                                                     dsName=dsName,
                                                     dsNum=dsNum,
                                                     apiURL=options.api_url):

        # create dir for history if it doesn't already exist
        hdir = _get_output_dir(options, history)

        # generate name for downloaded/linked file
        out_file_name = _get_output_file(options,
                hdir, dataset)

        # Create symlink if we can find the file locally
        if filesAreLocal and 'file_path' in dataset and not options.force_copy:
            logging.debug("Linking dataset")
            originalFile=dataset['file_path']
            os.symlink(originalFile, out_file_name)
        else:
            # Copy to local file from download URL
            url = dataset['download_url']
            response = requests.get(url, stream=True, verify=False)
            logging.debug(repr(dataset))
            logging.debug("Copying data from:\n%s" % (url))
            logging.info("Copyting to: %s" % (out_file_name))
            #with open(out_file_name, 'wb') as out_file:
            #    shutil.copyfileobj(response.raw, out_file)
            with open(out_file_name, 'wb') as out_file:
                for chunk in response.iter_content(options.chunk_size):
                    out_file.write(chunk)
            del response
def main():
    ## set up CLI
    usage = "usage: %prog [options]"
    description = """
    Count hits in a table with read and hit names.
    """

    parser = OptionParser(usage, description=description)
    parser.add_option("-i", "--infile", dest="infile",
                      metavar="FILE", help="Read raw table from INFILE")
    parser.add_option("-o", "--outfile", dest="outfile",
                      metavar="OUTFILE", help="Write collapsed table to OUTFILE")
    parser.add_option("-d", "--delim", dest="delim", default="\t",
                      help="Input table delimiter", metavar="DELIM")
    parser.add_option("-D", "--delimOut", dest="delimOut", default="\t",
                      help="Output table delimiter", metavar="DELIM")
    parser.add_option('-F', '--countFirst', action='store_true', default=False,
                       help="Don't skip the first line, it's NOT a header")
    parser.add_option("-R", "--readColumn", dest="readCol", type="int", default=0,
                      help="Index (starting at 0) of column with read name, 0 is default",
                      metavar="READCOL")
    parser.add_option("-H", "--hitColumn", dest="hitCol", type="int", default=2,
                      help="Index (starting at 0) of column with hit name (for counting), 2 is default, if less than zero, all (non-read) columns will be used as multiple hits",
                      metavar="HITCOL")
    parser.add_option('-s', '--hitSep', default=None,
                      help="Use this string to split multiple values in single hit cell. Default is 'None' to leave hits as is, use 'eval' to parse as python repr strings")
    addWeightOption(parser, multiple=False)
    parser.add_option("-T", "--total", default=False, action="store_true",
                      help="Report 'Total' in the first row")

    # cutoff options
    addCountOptions(parser,{'cutoff':0})

    # logging and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # make sure we have something to do
    if options.infile is None:
        logging.info("Reading table from: STDIN")
    else:
        logging.info("Reading table from: " + options.infile)

    if options.outfile is None:
        logging.info("Writing counts to: STDOUT")
    else:
        logging.info("Writing counts to: " + options.outfile)

    # process arguments
    takeFirst = (options.allMethod == 'first')
    splitHits = (options.hitSep is not None and options.hitSep != 'None')
    uncluster = (options.weights is not None)

    if options.hitSep=='eval':
        parser.error("Sorry, parsing with eval is not yet supported!")

    ## inform the curious user
    logging.info ("Delimiter: '" + options.delim )
    logging.info ("Read names in col: '" + str(options.readCol) )
    logging.info ("Hit names in col: '" + str(options.hitCol) )
    if splitHits:
        logging.info("Splitting hits with: %s" % (options.hitSep))
        logging.warn("Splitting hits has not been tested yet! Let me know how it goes.")
    if takeFirst:
        logging.info("Taking first hit for each read.");
    else:
        if options.allMethod == 'portion':
            logging.info ("Dividing count among all hits for each read.")
        else:
            logging.info ("Adding 1 to every hit for each read")
    if uncluster:
        logging.info("Getting read cluster sizes from: %s" % (options.weights));
    if options.countFirst:
        logging.info("First line is data")
    else:
        logging.info("Skipping first line")

    # Do the counting!
    counts = {}
    countHitsForRead=getAllMethod(options.allMethod)

    clusteredReadCounts={}
    if uncluster:
        clusteredReadCounts = parseMapFile(options.clusterFile, valueType=int)

    currentRead=''
    readCount=1
    hits=[]

    if options.infile is None:
        infile = sys.stdin
    else:
        infile = open(options.infile)

    # loop over lines
    if not options.countFirst:
        # skip first line
        try:
            infile.next()
        except StopIteration:
            raise Exception("No lines in %s" % str(infile))

    for line in infile:
        line=line.rstrip('\r\n')
        rowcells = line.split(options.delim)
        # get read
        read = rowcells[options.readCol]

        # if it's a new read, process previous read
        if currentRead=='':
            currentRead=read
        elif read != currentRead and currentRead != '':
            readCount+=1
            logging.info( "Checking hits for %s" % currentRead)

            # was it part of a cluster?
            multiplier = 1
            if uncluster:
                multiplier = clusteredReadCounts[currentRead]

            # where does the count for this read go
            countHitsForRead(hits, counts, multiplier=multiplier)

            hits=[]
            currentRead=read

        # get hit from this line
        if options.hitCol>=0:
            hit=rowcells[options.hitCol]
            if splitHits:
                hits.extend(hit.split(options.hitSep))
            else:
                hits.append(hit)
        else:
            rowcells.pop(options.readCol)
            hits.extend(rowcells)

    # check last read!
    logging.info( "Checking hits for %s" % currentRead)
    # was it part of a cluster?
    multiplier = 1
    if uncluster:
        multiplier = clusteredReadCounts[currentRead]
    # where does the count for this read go
    countHitsForRead(hits,counts,multiplier=multiplier)

    # apply cutoff
    if options.cutoff>0:
        applyFractionalCutoff(counts, threshold=options.cutoff*readCount)

    # print output
    if options.outfile is None:
        outhandle = sys.stdout
    else:
        outhandle = open(options.outfile,'w')

    if options.total:
        outhandle.write("Total%s%d\n" % (options.delimOut, readCount))

    if options.allMethod=='portion':
        outFmtString = "%s%s%f\n"
    else:
        outFmtString = "%s%s%d\n"

    delimRE = re.compile(options.delimOut)
    for hit, count in counts.iteritems():
        hit=delimRE.sub('_',hit)
        outhandle.write(outFmtString % (hit,options.delimOut,count))
def main():
    usage = "usage: %prog [OPTIONS] BLAST_FILE"
    description = """
    Take a blast result table and output a subset of hits based on the chosen filtering options. If more than one blast file given, use -O to get multiple output files, otherwise all output data will be concatenated into one output.
    """

# command line options
    parser = OptionParser(usage, description=description, conflict_handler='resolve')
    addHitTableOptions(parser, flags='all')
    parser.add_option("-o", "--outfilenome", dest="outfilename", default=None,
                      metavar="OUTFILENAME", help="Write masked fasta output to OUTFILENAME.")
    parser.add_option('-O', '--autoOutName', default=False,
                      action='store_true',
                      help="Automatically generate output file name from input name and options. Overridden by -o, cannot be used with data from STDIN.")

    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options,description)

    if options.hitTableFormat=='last':
        if options.hitTableSort=='evalue':
            parser.error("The last format has no evalue to sort by, sorry")

    # branch on the number of input files given as arguments
    if len(args) <= 1:
        # input
        if len(args) == 1:
            infile = args[0]
            logging.info("reading data from %s" % (infile))
            instream = open(infile,'rU')
        else:
            infile = './stdin'
            logging.info("reading data from STDIN")
            instream=sys.stdin

        # output
        if options.outfilename is not None:
            logging.info("Writing data to %s" % (options.outfilename))
            outstream=open(options.outfilename,'w')
        elif options.autoOutName:
            outfile=getOutputFile(infile,options)
            logging.info("Writing data to %s" % (outfile))
            outstream=open(outfile,'w')
        else:
            logging.info("writing data to STDOUT")
            outstream=sys.stdout

        # filter
        params=FilterParams.createFromOptions(options)
        filterM8(instream,outstream,params)
    else:
        if not options.autoOutName:
            if options.outfilename is not None:
                logging.info("Writing data to %s" % (options.outfilename))
                outstream=open(options.outfilename,'w')
            else:
                logging.info("writing data to STDOUT")
                outstream=sys.stdout
        for infilename in args:
            logging.info("reading data from %s" % (infilename))
            instream=open(infilename,'rU')
            if options.autoOutName:
                outstream=open(getOutputFile(infilename,options),'w')

            # filter
            params=FilterParams.createFromOptions(options)
            filterM8(instream,outstream,params)

            if options.autoOutName:
                outstream.close()
            instream.close()
def main():
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes a single m8 blast file and generates a table (or tables) of pathway/gene family assignments for the query sequences (aka 'reads'). Assignments can be for gene families, gene classes, or pathways. Multiple pathway or classification levels can be given. If they are, an assignment will be made at each level.
    This differs from assignPathsToReadsFromBlast.py in that: (1) it can handle CAZy and SEED, (2) it will output multiple levels in one file, (3) multiple assignments are always printed on multiple lines.
    This script will work with KEGG, SEED, or CAZy. CAZy only has one level of hierarchy; the others have 3. The CAZy hierarchy is apparent from the hit name and needs no supporting files. KEGG and SEED require mapping files to identify gene families and hierarchy files to report levels other than the gene family or ortholog level. Both SEED and KEGG have three levels of classifications that can be indicated with a 1, 2, or 3. The words "subsystem" and "pathway" are synonyms for level 3.
    If a count method is selected that can produce multiple assignments per read, each assignment will be printed on a new line. 
    NOTE: in KEGG (and SEED) a single ortholog (role) may belong to multiple pathways (subsystems). A hit to such an ortholog will result in extra assignment values for that query sequence (1 for each pathway it belongs to). 
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    parser.add_option("-l", "--level", dest="levels", default=None,
                      metavar="LEVEL", action="append",
                      help=""" Level(s) to collect counts on. Use flag 
                      multiple times to specify multiple levels. If multiple 
                      values given, one table produced for each with rank 
                      name appended to file name. Levels can be an integer 
                      (1-3) for KEGG or SEED levels, any one of 'gene', 'role', 'family', 
                      'ko', or 'ortholog' (which are all synonyms), or  
                      anything not synonymous with 'gene' to 
                      get CAZy groups. Defaults to ortholog/role and 
                      levels 1, 2, and 3 for KEGG and SEED
                      and gene and group for CAZy and COG.""")
    parser.add_option('-s','--squash',dest='splitForLevels',
            default=True, action='store_false',
            help="Don't split assignment rows if gene maps to multiple pathways, just squash them into one row using python list syntax")

    # format, ortholog heirarchy, and more
    kegg.addPathOptions(parser)

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    # Set defaults and check for some conflicts
    if options.levels is None and options.heirarchyFile is None:
        # using hit names only
        options.levels=[None]
    else:
        if options.heirarchyFile is None and options.heirarchyType != 'cazy':
            logging.warn("Type: %s" % (options.heirarchyType))
            parser.error("Cannot select levels without a heirarchy (ko) file")
        if options.levels is None:
            # set a default
            if options.heirarchyType == 'kegg':
                options.levels=['ko','1','2','pathway']
            elif options.heirarchyType == 'seed':
                options.levels=['role','1','2','subsystem']
            else:
                options.levels=['gene','group']

        try:
            # Make sure the level list makes sense
            options.levels=cleanLevels(options.levels)
        except Exception as e:
            parser.error(str(e))

    # only print to stdout if there is a single input file
    if len(args)>1 and options.outfile is None:
        parser.error("STDOUT only works if a single input file is given!")


    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle == 'auto':
            with open(options.mapFile) as f:
                firstLine=f.next()
                while len(firstLine)==0 or firstLine[0]=='#':
                    firstLine=f.next()
            if koMapRE.search(firstLine):
                options.mapStyle='kegg'
            elif seedMapRE.search(firstLine):
                options.mapStyle='seed'
            elif tabMapRE.search(firstLine):
                options.mapStyle='tab'
            #elif cogMapRE.search(firstLine):
            #    options.mapStyle='cog'
            else:
                raise Exception("Cannot figure out map type from first line:\n%s" % (firstLine))

        logging.info("Map file seems to be: %s" % (options.mapStyle))
        if options.mapStyle=='kegg':
            valueMap=kegg.parseLinkFile(options.mapFile)
        elif options.mapStyle=='seed':
            valueMap=kegg.parseSeedMap(options.mapFile)
        #elif options.mapStyle=='cog':
        #    valueMap=kegg.parseCogMap(options.mapFile)
        else:
            if options.parseStyle == hits.GIS:
                keyType=int
            else:
                keyType=None
            valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType)
        if len(valueMap)>0:
            logging.info("Read %d items into map. EG: %s" % (len(valueMap),valueMap.iteritems().next()))
        else:
            logging.warn("Read 0 items into value map!")
    else:
        valueMap=None

    # set up level mapping
    levelMappers = [getLevelMapper(l,options) for l in options.levels]

    # parse input files
    for (inhandle,outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMapIter = hits.parseM8FileIter(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.mappedHitsOnly,sortReads=options.hitTableSortReads)

        outhandle.write("Read\t%s\n" % ('\t'.join(options.levels)))
        for read, hitIter in hitMapIter:
            assignments=[]
            for hit in hitIter:
                logging.debug("Hit: %s" % (hit))
                assignment=[]
                for levelMapper in levelMappers:
                    assignment.append(levelMapper(hit))
                assignments.append(assignment)
            logging.debug("Read %s has %d hits" % (read, len(assignments)))
            for assignment in assignments:
                for assignmentList in handleMultipleMappings(assignment,options):
                    outhandle.write("%s\t%s\n" % (read, "\t".join(assignmentList)))
def main():
    usage = "usage: %prog [OPTIONS] INPUT_FILE(S)"
    description = """
Takes an m8 blast and assigns each read to a pathway or gene family. Blast may be specified with -i or piped to STDIN.
    """
    parser = OptionParser(usage, description=description)
    parser.add_option("-i", "--inputfile", dest="infile",
                      metavar="INFILE", help="Read data table from INFILE"),
    addIOOptions(parser)
    parser.add_option('-O', "--outputStyle", default="cols",
                      choices=['cols','lines','python'],
                      help="How are multiple assignments displayed in output. By default ('cols'), multiple hits show up in multiple columns. The 'lines' option prints out a new line for each assignment. The 'python' option prints each assignment as a python string (in quotes) or a list of strings (in quotes, separted by commas, surrounded bya  pair of sqaure brackets).")
    parser.add_option("-m", "--mapFile", dest="mapFile",
                      metavar="MAPFILE", help="Location of file containing table of with db hit name as first column and geneIDs (Knumber) in second column.")
    parser.add_option("-M", "--mapStyle", default='auto', choices=['auto','kegg','tab'],
                      help="What type of mapping file are you using: simple tab separated list of IDs and kos, or the genes_ko.list file from KEGG (which adds ko: to the K numbers and can have multiple records for each gene id). By default, this script will inspect the file name and guess, but you can force either 'kegg' or 'tab' with this option.")
    parser.add_option("-p", "--parseStyle",
                      default=KEGG,
                      choices=[ACCS,GIS,KEGG,HITID,HITDESC],
                      help="What should be parsed from the hit table: accessions('accs'), 'gis', K numbers in description ('kegg'), the full hit name('hitid'), or the full hit description('hitdesc'). (defaults to '%default')")
    parser.add_option("-c", "--cutoff", dest="cutoff", type="float", default=0.01,
            help="Cutoff for showing paths or genes. If a fractional count for a path/gene is below this value, it will be labelled None.",
                  metavar="CUTOFF")

    # format and filterPct
    addHitTableOptions(parser)

    parser.add_option("-C", "--countMethod", dest="countMethod", default="all", choices=('first','most','all','consensus'),
                      help="How to deal with assignments from multiple hits. (first, most: can return multiple hits, all (default): return every hit, consensus: return None unless all the same)",
                    metavar="COUNTMETHOD")
    parser.add_option("-r","--filterForKO",action="store_true", dest="koHitsOnly", default=False, help="ignore hits with no KO assignment. This means reads with no hits to KO tagged sequences will not be in the output.")
    parser.add_option("-l","--level", dest="level", default="ko", choices=('ko','NAME','DEFINITION','EC','PATHWAY','1','2','3'), help="Either 'ko'; a string to look for in ko file ('PATHWAY','NAME', 'DEFINITION', or 'EC'); or level in kegg class heirarchy (1, 2, or 3 (should be same as PATHWAY))")
    parser.add_option("-k", "--koFile", dest="ko", metavar="KOFILE", default=None,
                      help="File containing kegg heirarchy (either ko or ko00001.keg)")
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if options.infile is None:
        infile = sys.stdin
    else:
        infile = open(options.infile)

    if options.parseStyle==KEGG:
        if options.mapFile is not None:
            logging.warn("Do you REALLY want to apply a mapping to KOs?")

    if options.level != 'ko':
        if options.ko is None:
            parser.error("Please supply a KEGG file if specifying a level other than 'ko'")

        # read KEGG file
        koTranslation = readKEGGFile(options.ko, options.level)
    else:
        koTranslation = None

    # map reads to hits
    if options.mapFile is not None:
        if options.mapStyle=='kegg' or (options.mapStyle=='auto' and options.mapFile.endswith('genes_ko.list')):
            valueMap=parseLinkFile(options.mapFile)
        else:
            if options.parseStyle == GIS:
                keyType=int
            else:
                keyType=None
            valueMap = parseMapFile(options.mapFile,valueType=None,keyType=keyType)
    else:
        valueMap=None

    for (inhandle,outhandle) in inputIterator(args, options):
        logging.debug("Reading from %s and writing to %s" % (inhandle, outhandle))
        hitMap = parseM8File(inhandle, valueMap, options.hitTableFormat, options.filterTopPct, options.parseStyle, options.countMethod, ignoreEmptyHits=options.koHitsOnly,sortReads=options.hitTableSortReads)

        # manipulate mappings
        hitMap = applySimpleCutoff(hitMap, options.cutoff, koTranslation)

        log("maps complete for %d reads" % (len(hitMap)))

        # print out hit table
        outhandle.write("Read\tHit\n")
        if options.outputStyle=='python':
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                outhandle.write(str(read))
                outhandle.write("\t")
                outhandle.write(repr(hit))
                outhandle.write("\n")
        elif options.outputStyle=='lines':
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                if type(hit) is type([]):
                    for h in sorted(hit):
                        outhandle.write(str(read))
                        outhandle.write("\t")
                        outhandle.write(str(h))
                        outhandle.write("\n")
                else:
                    outhandle.write(str(read))
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                    outhandle.write("\n")
        else:
            for read in sorted(hitMap.keys()):
                hit=hitMap[read]
                outhandle.write(str(read))
                if type(hit) is type([]):
                    for h in sorted(hit):
                        outhandle.write("\t")
                        outhandle.write(str(h))
                else:
                    outhandle.write("\t")
                    outhandle.write(str(hit))
                outhandle.write("\n")
def main():
    usage = "usage: %prog [OPTIONS] BLAST_M8_FILE[S]"
    description = """
Takes m8 blast files and generates a table of taxon hit counts for the given rank. Columns are input files and rows are taxa. If multiple ranks given (the default), multiple output files are produced, each with the rank name appended to the output file name.
    """
    parser = OptionParser(usage, description=description)
    parser.add_option("-o", "--outfile", dest="outfile", metavar="OUTFILE", help="Write count table to OUTFILE")
    parser.add_option(
        "-r",
        "--rank",
        dest="ranks",
        default=None,
        metavar="RANK",
        action="append",
        help=""" Rank(s) to collect counts on. Use flag multiple
                      times to specify multiple ranks. If multiple values
                      given, one table produced for each with rank name
                      appended to file name. Defaults to all major ranks
                      between phylum and species. Corresponds to rank names 
                      in nodes.dmp. To see list run: 
                      'cut -f5 nodes.dmp | uniq | sort | uniq' 
                      in ncbi tax dir. Will also accept 'organism' to mean 
                      no rank (ie, just the organism name).""",
    )
    parser.add_option(
        "-s",
        "--collapseToDomain",
        default=False,
        action="store_true",
        help="Collapse all taxa below given rank down to superkingdom/domain. EG: in the genus output, anything assigned to Cyanobactia, will be lumped in with all other bacteria",
    )
    parser.add_option(
        "-R",
        "--printRank",
        dest="printRanks",
        action="append",
        help="Include indeicated rank(s) in lineage of printed taxa. Will be ignored if beyond the rank of the taxa (IE We can't include species if the taxon being counted is genus)",
    )

    # option for deconvoluting clusters or assemblies
    addWeightOption(parser, multiple=True)

    # cutoff options
    addCountOptions(parser)

    # format, tax dir, and more
    addTaxonOptions(parser, choices={"countMethod": ("LCA", "all", "first", "most", "tophit", "toporg", "consensus")})

    # log level and help
    addUniversalOptions(parser)

    (options, args) = parser.parse_args()

    setupLogging(options, description)

    if len(args) == 0:
        parser.error("Must supply at least one m8 file to parse")

    # Handle the case where Galaxy tries to set None as a string
    options.ranks = checkNoneOption(options.ranks)
    options.printRanks = checkNoneOption(options.printRanks)

    # Set defaults and check for some conflicts
    if options.ranks is None and options.taxdir is None:
        # using hit names only
        options.ranks = [ORG_RANK]
        if options.printRanks is not None:
            parser.error("Display ranks are not used without taxonomic info")
    else:
        if options.taxdir is None:
            parser.error("Cannot select ranks without a taxonomy")
        if options.ranks is None:
            # set a default
            options.ranks = ["phylum", "class", "order", "family", "genus", "species"]

        try:
            # Make sure the rank lists make sense
            options.ranks = cleanRanks(options.ranks)
            if options.printRanks is not None:
                options.printRanks = cleanRanks(options.printRanks)
        except Exception as e:
            parser.error(str(e))

    # load weights file
    sequenceWeights = loadSequenceWeights(options.weights)

    # only print to stdout if there is a single rank
    if len(options.ranks) > 1 and options.outfile is None:
        parser.error("STDOUT only works if a single rank is chosen!")

    cutoff = options.cutoff

    # Because rank is used in parsing hits, we can only do multiple ranks for
    # certain kinds of count methods
    if len(options.ranks) > 1:
        rank = None
        if options.countMethod in ["consensus", "most"]:
            parser.error(
                "Using multiple ranks does not work with the 'consensus' or 'most' counting methods. LCA should give the same results as consensus. If you really want to do this, us a bash loop:'for rank in phylum order genus; do COMMAND -r ${rank}; done'"
            )
    else:
        rank = options.ranks[0]

    # load necessary maps
    (taxonomy, hitStringMap) = readMaps(options)

    # parse input files
    fileCounts = {}
    totals = {}
    fileLabels = {}
    sortedLabels = []

    # Allow for file names to be preceded with TAG=
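    # e.g. an argument of "siteA=path/to/siteA.blastx.b6" (hypothetical)
    # labels that file's column "siteA"; without a tag, the file name is used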
    for filename in args:
        bits = filename.split("=", 1)
        if len(bits) > 1:
            (filetag, filename) = bits
        else:
            filetag = filename
        fileLabels[filename] = filetag
        # keep order so that column order matches arguments
        sortedLabels.append(filetag)
        fileCounts[filetag] = {}
        totals[filetag] = 0

    if options.countMethod == "tophit" or options.countMethod == "toporg":
        # Process all files at once and use overall abundance to pick best hits
        from edl import redistribute

        params = FilterParams.createFromOptions(options)
        (multifile, readFileDict) = redistribute.multipleFileWrapper(fileLabels.keys(), params, returnLines=True)

        if options.countMethod == "tophit":
            # don't give any taxonomy, just map to accessions for redistribution
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                winnerTakeAll=True,
                parseStyle=options.parseStyle,
                sequenceWeights=sequenceWeights,
            )
            # define method to turn Hits into organisms
            hitTranslator = getHitTranslator(
                parseStyle=options.parseStyle, taxonomy=taxonomy, hitStringMap=hitStringMap
            )
            translateHit = lambda hit: hitTranslator.translateHit(hit)[0]
        else:
            # translate to organism before finding most abundant
            readHits = redistribute.pickBestHitByAbundance(
                multifile,
                filterParams=params,
                returnLines=False,
                returnTranslations=True,
                winnerTakeAll=True,
                taxonomy=taxonomy,
                hitStringMap=hitStringMap,
                parseStyle=hits.ACCS,
            )
            # Organisms will be returned, make translator trivial:
            translateHit = lambda hit: hit

        # use read->file mapping and hit translator to get file based counts
        #  from returned (read,Hit) pairs
        increment = 1
        for (read, hit) in readHits:
            filename = readFileDict[read]
            filetag = fileLabels[filename]
            taxon = translateHit(hit)
            taxcount = fileCounts[filetag].setdefault(taxon, 0)
            if sequenceWeights is not None:
                increment = sequenceWeights.get(read, 1)
            fileCounts[filetag][taxon] = taxcount + increment
            totals[filetag] += increment
        logging.debug(str(totals))

    else:
        # Original way, just process each file separately
        for (filename, filetag) in fileLabels.iteritems():
            infile = open(filename, "rU")

            hitIter = parseM8FileIter(
                infile,
                hitStringMap,
                options.hitTableFormat,
                options.filterTopPct,
                options.parseStyle,
                options.countMethod,
                taxonomy=taxonomy,
                rank=rank,
                sortReads=options.hitTableSortReads,
            )

            (total, counts, hitMap) = countIterHits(hitIter, allMethod=options.allMethod, weights=sequenceWeights)
            fileCounts[filetag] = counts
            totals[filetag] = total

            logging.info(
                "parsed %d hits (%d unique) for %d reads from %s" % (total, len(counts), len(hitMap), filename)
            )

            infile.close()

    printCountTablesByRank(fileCounts, totals, sortedLabels, options)
def main():
    usage = "usage: %prog OPTIONS JSON_FILE(s)"
    description = """
    Generates a sunburst plot for each input JSON tree.
    """
    parser = OptionParser(usage, description=description)
    addIOOptions(parser)
    addUniversalOptions(parser)

    parser.add_option('-r', "--root", default=None, help="Plot a subset of the tree by choosing a root node for the subtree")
    parser.add_option('-c', "--colors", default=None, help="Set colors by mapping node IDs to color strings. Value should be a comma-separated list of id=color pairs (Bacteria=g,Archaea=r). The subtree of each mapped node will get the given color unless overridden by another entry. If omitted, colors pulled from JSON (using colorkey) with red as the default. If present without --colorkey setting, colors in JSON will be ignored.")
    parser.add_option('-s','--sort', default=None,
                      help="List of keys to sort on for plotting, NOTE: sorting on the value key will give suprising results for lower level nodes as sum of nested values will not be included. To get desired behavior, add a total value key to your tree and sort on that.")

    parser.add_option('-I','--idkey', default='name', help="String to use as key for node IDs. Default: %default")
    parser.add_option('-L','--labelkey', default='name', help="String to use as key for node labels. Default: %default")
    parser.add_option('-C','--colorkey', default='color', help="String to use as key for node colors. Default: %default")
    parser.add_option('-V','--valuekey', default='size', help="String to use as key for node sizes. Default: %default")
    parser.add_option('-K','--kidskey', default='children', help="String to use as key for list of child nodes. Default: %default")

    parser.add_option('-i', '--icicle', default=False, action='store_true',
                      help="Print stacked bars in rectangular coordinates, not polar.")
    parser.add_option('-e', '--exterior_labels', default=False, action='store_true', help="Print labels for outermost nodes outside image")
    parser.add_option('-S', '--figsize', default=None,
                      help="Comma separated pair of numbers (in inches) for figure size")

    parser.add_option("-f", "--format", dest="format", default='pdf', choices=['png','ps','pdf','svg'],
		  help="Format for output image", metavar="FORMAT")

    (options, args) = parser.parse_args()

    # check arguments
    setupLogging(options, description)

    # setup matplotlib
    backend = options.format
    if backend=='png':
        backend='agg'
    matplotlib.use(backend)
    import matplotlib.pyplot as plt

    for (inhandle, outfile) in inputIterator(args, options):
        # import JSON
        tree=json.load(inhandle)

        # process user-selected options
        kwargs=processOptions(options)

        # process JSON
        if options.colors is not None:
            setColors(tree, options.colors, **kwargs)
        if options.root is not None:
            newRoot=findNode(tree, options.root, **kwargs)
            if newRoot is not None:
                tree=newRoot

        # some of the matplotlib functions don't like extra arguments
        kwargs.pop(ID)

        # create figure
        plotSunburstJSON(tree,**kwargs)

        # save to file
        plt.savefig(outfile, format=options.format)