コード例 #1
0
ファイル: hittables.py プロジェクト: jmeppley/py-metagenomics
def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Positional Arguments:
        infile: path to a hit table (str) or an already open handle. A path
        is opened here and is always closed before returning, even when
        parsing or counting raises. A handle passed in by the caller is
        never closed.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
        something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score for
        each read
        parseStyle (ACCS): how to process hit data into an identifying string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
        containing taxdmp
        rank (None): Maximum rank to resolve hits

    Additional optional parameters read by this function:
        namesMap (False): passed to readTaxonomy; only consulted when
        taxonomy is given as a string
        valueType (int if a taxonomy is in use, otherwise None): passed to
        parseMapFile; only consulted when hitStringMap is given as a string
        allMethod (ALLEQ): passed to countIterHits

    Returns the counts half of the (total, counts) pair produced by
    countIterHits; the total is only logged.
    """

    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))
    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        if taxonomy is not None:
            # the mapped hit ids will need to be ints
            valueType = kwargs.pop('valueType', int)
        else:
            valueType = kwargs.pop('valueType', None)
        hitStringMap = parseMapFile(hitStringMap, valueType=valueType)

    # if infile is name (and not handle), open as a handle
    openedHere = isinstance(infile, str)
    inhandle = open(infile) if openedHere else infile

    try:
        # get iterator over reads that will parse hits
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  FilterParams(
                                      format=kwargs.pop('format', GENE),
                                      top_pct=kwargs.pop('filter_top_pct', 0),
                                  ),
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits; the handle must stay open until this returns
        # because it is still being read while counting
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)
    finally:
        # only close what we opened; never close a caller-supplied handle
        if openedHere:
            inhandle.close()

    logger.info("Total hits: %s", total)

    return counts
コード例 #2
0
ファイル: hits.py プロジェクト: Piplopp/py-metagenomics
def parseM8FileIter(
        inhandle,
        hitStringMap,
        format,
        scorePct,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
        sortReads=False):
    """
    Chain filterM8Stream, processHits and applyCountMethod over a hit table.

    The stages are applied in this order:
        filterM8Stream: filter/parse the hits using format and scorePct
        processHits: translate hits using hitStringMap, parsingStyle,
        taxonomy and rank
        applyCountMethod: apply countMethod (with ignoreEmptyHits)

    Score filtering is only configured when scorePct >= 0. A countMethod of
    'first' forces scorePct to -1, which turns score filtering off.

    If taxonomy is not None, hits will be TaxNode objects
    countMethod can only be LCA if taxonomy given

    Return an iterator over (read,hits) tuples.
    """

    # 'first' never needs score filtering, so switch it off
    if countMethod == 'first':
        scorePct = -1

    logger.info("Parsing hits")
    filterOptions = FilterParams()
    filterOptions.format = format

    # filter hits on score if requested
    filterOnScore = scorePct >= 0
    if filterOnScore:
        logger.info(
            "Filtering for scores within %s pct of best" %
            scorePct)
        filterOptions.topPct = scorePct
        filterOptions.sort = 'score'

    # sortReads is only written onto the options when score filtering or
    # read sorting was requested
    if filterOnScore or sortReads:
        filterOptions.sortReads = sortReads

    filterOptions.parseStyle = parsingStyle

    # filters and parses
    parsedHits = filterM8Stream(inhandle, filterOptions, returnLines=False)

    # apply org or acc translation, the map of hit names (if given),
    # and the taxon node lookup
    translatedHits = processHits(
        parsedHits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # apply count method
    return applyCountMethod(translatedHits, countMethod, ignoreEmptyHits)
コード例 #3
0
ファイル: assembly.py プロジェクト: jmeppley/py-metagenomics
def plotHitStats(axes,
                 sequenceFile,
                 hitsFile,
                 referenceLengths=None,
                 sequenceFormat='fasta',
                 bins=20,
                 hlog=False,
                 lengthRange=None,
                 barcolor='b',
                 baredgecolor='k',
                 hcolor='r',
                 params=None,
                 **kwargs):
    """
    Draw two or three plots, binned by sequence length, onto the given
    matplotlib.axes.AxesSubplot objects:

     * axes[0]: a histogram of sequence lengths overlaid on the fraction of
       sequences in each bin that have a hit
     * axes[1]: same bins as above, but total sequence bases on top of the
       fraction of bases covered by hits
     * axes[2]: only drawn when referenceLengths is given; fraction of
       reference bases used in hits (using the same bins)

    Positional Arguments:
     * axes: list or tuple of ax objects; two are needed, or three when
       referenceLengths is given
     * sequenceFile: fasta or similar file of sequence data
     * hitsFile: text hit table

    Parameters:
     * hit parsing
      * params=None edl.blatm8.FilterParams object to filter hits
      * **kwargs used to create FilterParams object if params object not given
     * sequence parsing
      * sequenceFormat='fasta'. Can be anything supported by BioPython
      * referenceLengths=None: if given, create 3rd plot using given
        dictionary of hits. It can also just be the fasta of the reference
        sequences and the code will look up the lengths.
     * plotting:
      * bins=20 Number of length bins to divide sequence data into
      * barcolor='b' Color of data bars
      * baredgecolor='k' Color of data bar edges
      * hcolor='r' Color of histogram line and axis labels
      * lengthRange=None Can be used to force the x axis to span a
        specific range
      * hlog=False If set to True, histogram data plotted in log scale
    """

    # get sequence lengths
    seqLengths = getSequenceLengths(sequenceFile, format=sequenceFormat)

    # parse hit file, building the filter from kwargs when none was supplied
    if params is None:
        params = FilterParams(**kwargs)
    seqHits = getSequenceHits(hitsFile, params)

    # every plot gets the same binning and styling options
    sharedOptions = dict(bins=bins,
                         lengthRange=lengthRange,
                         barcolor=barcolor,
                         baredgecolor=baredgecolor,
                         hcolor=hcolor,
                         hlog=hlog)

    # plot data
    plotTranscriptHitRateByLengthBins(axes[0],
                                      seqLengths,
                                      seqHits,
                                      **sharedOptions)
    plotTranscriptCoverageByLengthBins(axes[1],
                                       seqLengths,
                                       seqHits,
                                       **sharedOptions)

    # the third plot needs reference lengths, so it is optional
    if referenceLengths is not None:
        plotHitCoverageByLengthBins(axes[2],
                                    seqLengths,
                                    seqHits,
                                    referenceLengths,
                                    **sharedOptions)