def countHits(infile, **kwargs):
    """
    Count hits from a hit table.

    Calls edl.hits.parseM8FileIter with the following optional parameters:
        hitStringMap (None): dictionary (or file) mapping hit IDs to
            something else
        format (GENE): hit table format
        filter_top_pct (0): only consider hits within this % of top score
            for each read
        parseStyle (ACCS): how to process hit data into an identifying
            string
        countMethod ('all'): how to resolve hits to multiple sequences
        taxonomy (None): An edl.taxon.Taxonomy object or directory
            containing taxdmp
        rank (None): Maximum rank to resolve hits

    Additional keyword arguments consumed here:
        namesMap (False): passed to readTaxonomy when taxonomy is given as
            a string
        valueType (int if a taxonomy is in use, else None): passed to
            parseMapFile when hitStringMap is given as a string
        allMethod (ALLEQ): passed to countIterHits

    infile may be a file name (opened and closed by this function) or an
    already open handle (left open for the caller).

    Returns the counts object produced by countIterHits.
    """
    # if taxonomy or hitStringMap are file names, parse them
    taxonomy = kwargs.pop('taxonomy', None)
    if isinstance(taxonomy, str):
        taxonomy = readTaxonomy(taxonomy,
                                namesMap=kwargs.pop('namesMap', False))

    hitStringMap = kwargs.pop('hitStringMap', None)
    if isinstance(hitStringMap, str):
        # with a taxonomy, the mapped hit ids will need to be ints
        defaultValueType = int if taxonomy is not None else None
        hitStringMap = parseMapFile(
            hitStringMap,
            valueType=kwargs.pop('valueType', defaultValueType))

    # if infile is name (and not handle), open as a handle
    openedHere = isinstance(infile, str)
    inhandle = open(infile) if openedHere else infile

    try:
        # get iterator over reads that will parse hits.
        # parseM8FileIter takes the table format and the score cutoff as
        # two separate positional arguments (not a FilterParams object).
        hitIter = parseM8FileIter(inhandle,
                                  hitStringMap,
                                  kwargs.pop('format', GENE),
                                  kwargs.pop('filter_top_pct', 0),
                                  kwargs.pop('parseStyle', ACCS),
                                  kwargs.pop('countMethod', 'all'),
                                  taxonomy=taxonomy,
                                  rank=kwargs.pop('rank', None))

        # count the hits (this is what actually consumes the handle)
        (total, counts) = countIterHits(hitIter,
                                        allMethod=kwargs.pop('allMethod',
                                                             ALLEQ),
                                        returnMap=False)
    finally:
        # only close handles we opened, and do so even if parsing failed
        if openedHere:
            inhandle.close()

    logger.info("Total hits: %s", total)
    return counts
def parseM8FileIter(
        inhandle,
        hitStringMap,
        format,
        scorePct,
        parsingStyle,
        countMethod,
        taxonomy=None,
        rank=None,
        ignoreEmptyHits=True,
        sortReads=False):
    """
    Build a (read, hits) iterator from a hit table handle.

    Chains three steps:
        filterM8Stream: filter and parse the table using format, scorePct
            and parsingStyle
        processHits: translate hits using hitStringMap, taxonomy and rank
        applyCountMethod: resolve multiple hits using countMethod

    Score filtering is turned off (scorePct forced to -1) when countMethod
    is 'first'. A non-negative scorePct, or a true sortReads, makes the
    filter sort on score.

    If taxonomy is not None, hits will be TaxNode objects
    countMethod can only be LCA if taxonomy given

    Return an iterator over (read,hits) tuples.
    """
    # the 'first' count method does not use score filtering
    effectivePct = -1 if countMethod == 'first' else scorePct
    filterOnScore = effectivePct >= 0

    logger.info("Parsing hits")
    filterOptions = FilterParams()
    filterOptions.format = format

    if filterOnScore:
        # filter hits on score if requested
        logger.info(
            "Filtering for scores within %s pct of best" % effectivePct)
        filterOptions.topPct = effectivePct
    if filterOnScore or sortReads:
        filterOptions.sort = 'score'
        filterOptions.sortReads = sortReads
    filterOptions.parseStyle = parsingStyle

    # filter and parse the raw table
    parsedHits = filterM8Stream(inhandle, filterOptions, returnLines=False)

    # apply org or acc translation, the hit name map (if given), and the
    # taxon node lookup
    translatedHits = processHits(
        parsedHits,
        hitStringMap=hitStringMap,
        parseStyle=parsingStyle,
        taxonomy=taxonomy,
        rank=rank)

    # apply count method
    return applyCountMethod(translatedHits, countMethod, ignoreEmptyHits)
def plotHitStats(axes, sequenceFile, hitsFile,
                 referenceLengths=None,
                 sequenceFormat='fasta',
                 bins=20, hlog=False, lengthRange=None,
                 barcolor='b', baredgecolor='k', hcolor='r', params=None,
                 **kwargs):
    """
    Draw two or three hit-statistics plots, all binned by sequence length,
    onto the given matplotlib axes:

     * axes[0]: histogram of sequence lengths overlaid on the fraction of
       sequences in each bin that have a hit
     * axes[1]: same bins, but total sequence bases on top of the fraction
       of bases covered by hits
     * axes[2]: only when referenceLengths is given: fraction of reference
       bases used in hits (same bins)

    Positional Arguments:
     * axes: indexable collection of ax objects (two are used, or three
       when referenceLengths is given)
     * sequenceFile: fasta or similar file of sequence data
     * hitsFile: text hit table

    Parameters:
     * hit parsing
      * params=None: FilterParams object used to filter hits
      * **kwargs: used to create a FilterParams object when params is None
     * sequence parsing
      * sequenceFormat='fasta': passed as the format to getSequenceLengths
      * referenceLengths=None: if given, the third plot is drawn from it.
        Described upstream as a dictionary of hit lengths or the fasta of
        the reference sequences; it is passed through unchanged to
        plotHitCoverageByLengthBins.
     * plotting:
      * bins=20: number of length bins to divide sequence data into
      * barcolor='b': color of data bars
      * baredgecolor='k': color of data bar edges
      * hcolor='r': color of histogram line and axis labels
      * lengthRange=None: can be used to force the x axis to span a
        specific range
      * hlog=False: if True, histogram data is plotted in log scale
    """
    # sequence lengths come from the sequence file
    lengths = getSequenceLengths(sequenceFile, format=sequenceFormat)

    # build the hit filter from kwargs unless one was supplied
    hitFilter = FilterParams(**kwargs) if params is None else params
    hits = getSequenceHits(hitsFile, hitFilter)

    # every plot shares the same binning and styling options
    sharedOptions = dict(bins=bins,
                         lengthRange=lengthRange,
                         barcolor=barcolor,
                         baredgecolor=baredgecolor,
                         hcolor=hcolor,
                         hlog=hlog)

    plotTranscriptHitRateByLengthBins(axes[0], lengths, hits,
                                      **sharedOptions)
    plotTranscriptCoverageByLengthBins(axes[1], lengths, hits,
                                       **sharedOptions)

    if referenceLengths is None:
        return
    plotHitCoverageByLengthBins(axes[2], lengths, hits, referenceLengths,
                                **sharedOptions)