def testGetTaxonomy(self):
    """
    LineageFetcher.lineage must return the expected list of
    (taxonomy id, scientific name) pairs for a sequence title.
    """
    # Canned rows handed to the fake database connection.
    cannedRows = [
        [15],
        ['Merkel cell polyomavirus'],
        [4],
        ['Polyomavirus'],
        [3],
        ['dsDNA viruses'],
        [2],
        ['Vira'],
        [1],
    ]
    expectedLineage = [
        (15, 'Merkel cell polyomavirus'),
        (4, 'Polyomavirus'),
        (3, 'dsDNA viruses'),
        (2, 'Vira'),
    ]

    fakeDb = FakeDbConnection(cannedRows)
    fetcher = LineageFetcher(db=fakeDb, cursor=fakeDb.cursor())

    self.assertEqual(
        expectedLineage,
        fetcher.lineage('gi|5|gb|EU375804.1| Merkel cell polyomavirus'))
def __init__(self, limit=None, maxAlignmentsPerRead=None,
             minSequenceLen=None, maxSequenceLen=None,
             minStart=None, maxStop=None,
             oneAlignmentPerRead=False, maxHspsPerHit=None,
             scoreCutoff=None, percentageIdenticalCutoff=None,
             percentagePositiveCutoff=None, whitelist=None, blacklist=None,
             whitelistFile=None, blacklistFile=None,
             titleRegex=None, negativeTitleRegex=None,
             truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
    """
    Record the filtering criteria on the instance.

    The numeric / boolean criteria (limit, maxAlignmentsPerRead,
    minSequenceLen, maxSequenceLen, minStart, maxStop, oneAlignmentPerRead,
    maxHspsPerHit, scoreCutoff, percentageIdenticalCutoff and
    percentagePositiveCutoff) are stored unchanged under the same names.

    @param whitelist, blacklist, whitelistFile, blacklistFile, titleRegex,
        negativeTitleRegex, truncateTitlesAfter: If any of these is truthy,
        all of them are passed to a C{TitleFilter} that is stored as
        C{self.titleFilter}. Otherwise C{self.titleFilter} is C{None}.
    @param taxonomy: Stored as C{self.taxonomy}. If not C{None}, a
        C{LineageFetcher} is created and stored as C{self.lineageFetcher},
        else C{self.lineageFetcher} is C{None}.
    @param readIdRegex: If not C{None}, it is compiled with C{re.compile}
        and the result is stored as C{self.readIdRegex}.
    """
    self.limit = limit
    self.maxAlignmentsPerRead = maxAlignmentsPerRead
    self.minSequenceLen = minSequenceLen
    self.maxSequenceLen = maxSequenceLen
    self.minStart = minStart
    self.maxStop = maxStop
    self.oneAlignmentPerRead = oneAlignmentPerRead
    self.maxHspsPerHit = maxHspsPerHit
    self.scoreCutoff = scoreCutoff
    self.percentageIdenticalCutoff = percentageIdenticalCutoff
    self.percentagePositiveCutoff = percentagePositiveCutoff

    # A title filter is only needed when at least one title-related
    # criterion was actually given.
    titleCriteria = (whitelist, blacklist, whitelistFile, blacklistFile,
                     titleRegex, negativeTitleRegex, truncateTitlesAfter)
    self.titleFilter = TitleFilter(
        whitelist=whitelist, blacklist=blacklist,
        whitelistFile=whitelistFile, blacklistFile=blacklistFile,
        positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
        truncateAfter=truncateTitlesAfter) if any(titleCriteria) else None

    self.lineageFetcher = None if taxonomy is None else LineageFetcher()
    self.taxonomy = taxonomy

    self.readIdRegex = (
        None if readIdRegex is None else re.compile(readIdRegex))

    self.count = 0
def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
            oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
            blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
            taxonomy, iteratorIndex, readIdRegex):
    """
    Filter the read alignments in self.

    Do not call this function directly, instead use self.filter.
    Argument defaults and descriptions (other than for iteratorIndex) are
    as in self.filter.

    Note that the yielded C{ReadAlignments} instances are modified in
    place: unwanted alignments are removed from them and the C{hsps}
    attribute of surviving alignments may be replaced by a shorter list.

    @param iteratorIndex: An index into self._iterators. Calling the
        iterator function will return a generator that yields
        C{ReadAlignments} instances.
    @return: A generator that yields C{ReadAlignments} instances.
    """

    # Implementation notes:
    #
    # 1. The order in which we carry out the filtering actions can make
    #    a big difference in the result of this function. The current
    #    ordering is based on what seems reasonable - it may not be the
    #    best way to do things. E.g., if maxHspsPerHit is 1 and there
    #    is a title regex, which should we perform first?
    #
    #    We perform filtering based on alignment before that based on
    #    HSPs. That's because there's no point filtering all HSPs for
    #    an alignment that we end up throwing away anyhow.
    #
    # 2. This function could be made faster if it first looked at its
    #    arguments and dynamically created an acceptance function
    #    (taking a readAlignments as an argument). The acceptance
    #    function would run without examining the above arguments for
    #    each match the way the current code does.
    #
    # 3. A better approach with readIdRegex might be to allow the
    #    passing of a regex object. Then the caller would make the
    #    regex with whatever flags they liked (e.g., case insensitive).

    #
    # Alignment-only (i.e., non-HSP based) filtering.
    #

    # If we've been asked to filter on matched sequence titles in any way,
    # build a title filter.
    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    # Compile the regex before acquiring the lineage fetcher, so an invalid
    # pattern cannot leave an unclosed fetcher behind.
    if readIdRegex is not None:
        readIdRegex = re.compile(readIdRegex)

    lineageFetcher = LineageFetcher() if taxonomy is not None else None

    count = 0
    try:
        for readAlignments in self._iterators[iteratorIndex]():
            if limit is not None and count == limit:
                return

            # Filter on the read id.
            if (readIdRegex and
                    readIdRegex.search(readAlignments.read.id) is None):
                continue

            if titleFilter:
                # Remove alignments against sequences whose titles are
                # unacceptable.
                wantedAlignments = [
                    alignment for alignment in readAlignments
                    if (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT)]
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            # Only return alignments that are against sequences of the
            # desired length.
            if minSequenceLen is not None or maxSequenceLen is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    length = alignment.subjectLength
                    tooShort = (minSequenceLen is not None and
                                length < minSequenceLen)
                    tooLong = (maxSequenceLen is not None and
                               length > maxSequenceLen)
                    if not (tooShort or tooLong):
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            if taxonomy is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    lineage = lineageFetcher.lineage(alignment.subjectTitle)
                    # Keep the alignment if no lineage info was found
                    # (we can't rule it out - we could add another option
                    # to control this) or if the wanted taxonomy appears at
                    # any level. Using any() adds the alignment only once,
                    # even when several lineage levels match.
                    if not lineage or any(
                            taxonomy in taxonomyIdAndScientificName
                            for taxonomyIdAndScientificName in lineage):
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            if oneAlignmentPerRead and readAlignments:
                readAlignments[:] = [bestAlignment(readAlignments)]

            #
            # From here on we do only HSP-based filtering.
            #

            # Throw out any unwanted HSPs due to maxHspsPerHit.
            if maxHspsPerHit is not None:
                for alignment in readAlignments:
                    hsps = alignment.hsps
                    if len(hsps) > maxHspsPerHit:
                        alignment.hsps = hsps[:maxHspsPerHit]

            # Throw out HSPs whose scores are not good enough.
            if scoreCutoff is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    wantedHsps = [hsp for hsp in alignment.hsps
                                  if hsp.betterThan(scoreCutoff)]
                    # An alignment with no surviving HSPs is dropped.
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            # Throw out HSPs that don't match in the desired place on the
            # matched sequence.
            if minStart is not None or maxStop is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    wantedHsps = []
                    for hsp in alignment.hsps:
                        startsTooEarly = (
                            minStart is not None and
                            hsp.readStartInSubject < minStart)
                        endsTooLate = (
                            maxStop is not None and
                            hsp.readEndInSubject > maxStop)
                        if not (startsTooEarly or endsTooLate):
                            wantedHsps.append(hsp)
                    # An alignment with no surviving HSPs is dropped.
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            yield readAlignments
            count += 1
    finally:
        # Always release the lineage fetcher if one was created: when the
        # iterator is exhausted, when the limit triggers an early return,
        # when the consumer closes or abandons the generator, and when an
        # exception propagates.
        if lineageFetcher is not None:
            lineageFetcher.close()
e, file=sys.stderr) sys.exit(1) if args.detailsFile is not None: detailsFp = open(args.detailsFile, 'w') def details(accept, readId, taxonomy): return writeDetails(accept, readId, taxonomy, detailsFp) else: detailsFp = None def details(accept, readId, taxonomy): return None lineageFetcher = LineageFetcher() reads = FastaReads(sys.stdin) save = sys.stdout.write readCount = saveCount = noTaxonomyCount = 0 for read in reads: readCount += 1 fasta = read.toString('fasta') taxonomy = lineageFetcher.lineage(read.id) if taxonomy: for taxonomyId, scientificName in taxonomy: if regexp.match(scientificName): details(True, read.id, taxonomy) if not args.invert: saveCount += 1 save(fasta)