def testGetTaxonomy(self):
    """
    Check that C{LineageFetcher.lineage} returns the expected list of
    (taxonomy id, scientific name) pairs for a sequence title when it is
    backed by a fake database connection.
    """
    # Canned results handed to the fake connection, in the order given.
    # NOTE(review): presumably the final [1] row is what ends the lineage
    # walk, since no (1, ...) pair is expected below - confirm against
    # LineageFetcher.
    cannedRows = [
        [15],
        ['Merkel cell polyomavirus'],
        [4],
        ['Polyomavirus'],
        [3],
        ['dsDNA viruses'],
        [2],
        ['Vira'],
        [1],
    ]
    expected = [
        (15, 'Merkel cell polyomavirus'),
        (4, 'Polyomavirus'),
        (3, 'dsDNA viruses'),
        (2, 'Vira'),
    ]

    connection = FakeDbConnection(cannedRows)
    fetcher = LineageFetcher(db=connection, cursor=connection.cursor())
    result = fetcher.lineage('gi|5|gb|EU375804.1| Merkel cell polyomavirus')

    self.assertEqual(expected, result)
def __init__(self, limit=None, maxAlignmentsPerRead=None,
             minSequenceLen=None, maxSequenceLen=None,
             minStart=None, maxStop=None,
             oneAlignmentPerRead=False, maxHspsPerHit=None,
             scoreCutoff=None, percentageIdenticalCutoff=None,
             percentagePositiveCutoff=None, whitelist=None, blacklist=None,
             whitelistFile=None, blacklistFile=None,
             titleRegex=None, negativeTitleRegex=None,
             truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
    """
    Store the filtering settings on the instance and build the helper
    objects (title filter, lineage fetcher, compiled read id regex) that
    the settings call for.
    """
    # Plain settings are stored unchanged.
    self.limit = limit
    self.maxAlignmentsPerRead = maxAlignmentsPerRead
    self.minSequenceLen = minSequenceLen
    self.maxSequenceLen = maxSequenceLen
    self.minStart = minStart
    self.maxStop = maxStop
    self.oneAlignmentPerRead = oneAlignmentPerRead
    self.maxHspsPerHit = maxHspsPerHit
    self.scoreCutoff = scoreCutoff
    self.percentageIdenticalCutoff = percentageIdenticalCutoff
    self.percentagePositiveCutoff = percentagePositiveCutoff

    # A title filter is only needed if at least one title-based option
    # was given (i.e., is truthy).
    titleFilteringWanted = any((
        whitelist, blacklist, whitelistFile, blacklistFile,
        titleRegex, negativeTitleRegex, truncateTitlesAfter))
    self.titleFilter = TitleFilter(
        whitelist=whitelist, blacklist=blacklist,
        whitelistFile=whitelistFile, blacklistFile=blacklistFile,
        positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
        truncateAfter=truncateTitlesAfter) if titleFilteringWanted else None

    # A lineage fetcher is only created when taxonomy filtering is on.
    self.lineageFetcher = None if taxonomy is None else LineageFetcher()
    self.taxonomy = taxonomy

    # The read id regex arrives as a string and is compiled once here.
    self.readIdRegex = (
        None if readIdRegex is None else re.compile(readIdRegex))

    # Number of reads accepted so far.
    self.count = 0
def testGetTaxonomy(self):
    """
    Check that C{LineageFetcher.lineage} returns the expected list of
    (taxonomy id, scientific name) pairs for a sequence title when it is
    backed by a fake database connection.
    """
    # Canned results handed to the fake connection, in the order given.
    # NOTE(review): presumably the final [1] row is what ends the lineage
    # walk, since no (1, ...) pair is expected below - confirm against
    # LineageFetcher.
    cannedRows = [
        [15],
        ['Merkel cell polyomavirus'],
        [4],
        ['Polyomavirus'],
        [3],
        ['dsDNA viruses'],
        [2],
        ['Vira'],
        [1],
    ]
    expected = [
        (15, 'Merkel cell polyomavirus'),
        (4, 'Polyomavirus'),
        (3, 'dsDNA viruses'),
        (2, 'Vira'),
    ]

    connection = FakeDbConnection(cannedRows)
    fetcher = LineageFetcher(db=connection, cursor=connection.cursor())
    result = fetcher.lineage('gi|5|gb|EU375804.1| Merkel cell polyomavirus')

    self.assertEqual(expected, result)
def __init__(self, limit=None, maxAlignmentsPerRead=None,
             minSequenceLen=None, maxSequenceLen=None,
             minStart=None, maxStop=None,
             oneAlignmentPerRead=False, maxHspsPerHit=None,
             scoreCutoff=None, whitelist=None, blacklist=None,
             whitelistFile=None, blacklistFile=None,
             titleRegex=None, negativeTitleRegex=None,
             truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
    """
    Store the filtering settings on the instance and build the helper
    objects (title filter, lineage fetcher, compiled read id regex) that
    the settings call for.
    """
    # Plain settings are stored unchanged.
    self.limit = limit
    self.maxAlignmentsPerRead = maxAlignmentsPerRead
    self.minSequenceLen = minSequenceLen
    self.maxSequenceLen = maxSequenceLen
    self.minStart = minStart
    self.maxStop = maxStop
    self.oneAlignmentPerRead = oneAlignmentPerRead
    self.maxHspsPerHit = maxHspsPerHit
    self.scoreCutoff = scoreCutoff

    # A title filter is only needed if at least one title-based option
    # was given (i.e., is truthy).
    titleFilteringWanted = any((
        whitelist, blacklist, whitelistFile, blacklistFile,
        titleRegex, negativeTitleRegex, truncateTitlesAfter))
    self.titleFilter = TitleFilter(
        whitelist=whitelist, blacklist=blacklist,
        whitelistFile=whitelistFile, blacklistFile=blacklistFile,
        positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
        truncateAfter=truncateTitlesAfter) if titleFilteringWanted else None

    # A lineage fetcher is only created when taxonomy filtering is on.
    self.lineageFetcher = None if taxonomy is None else LineageFetcher()
    self.taxonomy = taxonomy

    # The read id regex arrives as a string and is compiled once here.
    self.readIdRegex = (
        None if readIdRegex is None else re.compile(readIdRegex))

    # Number of reads accepted so far.
    self.count = 0
print('Could not compile %r to a regular expression:' % args.taxonomy, e, file=sys.stderr) sys.exit(1) if args.detailsFile is not None: detailsFp = open(args.detailsFile, 'w') def details(accept, readId, taxonomy): return writeDetails(accept, readId, taxonomy, detailsFp) else: detailsFp = None def details(accept, readId, taxonomy): return None lineageFetcher = LineageFetcher() reads = FastaReads(sys.stdin) save = sys.stdout.write readCount = saveCount = noTaxonomyCount = 0 for read in reads: readCount += 1 fasta = read.toString('fasta') taxonomy = lineageFetcher.lineage(read.id) if taxonomy: for taxonomyId, scientificName in taxonomy: if regexp.match(scientificName): details(True, read.id, taxonomy) if not args.invert: saveCount += 1 save(fasta)
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of records to read.
    @param maxAlignmentsPerRead: An C{int} limit on the number of alignments a
        read may have in order not to be filtered out. Reads with a greater
        number of alignments will be elided. Pass 0 to filter out reads that
        did not match (i.e., align to) any subjects. Use C{None} for no max
        alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched sequence
        should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best
        alignment for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are always
        acceptable (though the match info for a whitelist title may rule it
        out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title will
        be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. eg 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    """

    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None,
                 minStart=None, maxStop=None,
                 oneAlignmentPerRead=False, maxHspsPerHit=None,
                 scoreCutoff=None, whitelist=None, blacklist=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            self.titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        # A lineage fetcher is only created when taxonomy filtering is on.
        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        # The read id regex is passed as a string and compiled once here.
        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        # The number of reads accepted so far (compared against limit).
        self.count = 0

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        Note that the passed C{readAlignments} (and the C{hsps} lists of its
        alignments) are modified in place.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """
        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None and
                len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex and
                self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = [
                alignment for alignment in readAlignments
                if (self.titleFilter.accept(alignment.subjectTitle) !=
                    TitleFilter.REJECT)]
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        if self.taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = self.lineageFetcher.lineage(alignment.subjectTitle)
                # If no lineage info was found, keep the alignment since
                # we can't rule it out (we could add another option to
                # control this). Otherwise keep it if any lineage entry
                # contains the wanted taxonomy. Using any() makes sure an
                # alignment is kept once, even if several entries match.
                if not lineage or any(self.taxonomy in idAndName
                                      for idAndName in lineage):
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if self.maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > self.maxHspsPerHit:
                    alignment.hsps = hsps[:self.maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if self.scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                wantedHsps = [hsp for hsp in alignment.hsps
                              if hsp.betterThan(self.scoreCutoff)]
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                wantedHsps = []
                for hsp in alignment.hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Test the fetcher itself, not the truthiness of self.taxonomy: a
        # fetcher is created for any taxonomy that is not None (including
        # falsy values such as 0 or ''), and it must be closed then too.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
            oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
            blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
            taxonomy, iteratorIndex, readIdRegex):
    """
    Filter the read alignments in self.

    Do not call this function directly, instead use self.filter.
    Argument defaults and descriptions (other than for iteratorIndex) are
    as in self.filter.

    Each yielded C{ReadAlignments} instance (and the C{hsps} lists of its
    alignments) is modified in place by the filtering.

    @param iteratorIndex: An index into self._iterators. Calling the
        iterator function will return a generator that yields
        C{ReadAlignments} instances.
    @return: A generator that yields C{ReadAlignments} instances.
    """
    # Implementation notes:
    #
    # 1. The order in which we carry out the filtering actions can make
    #    a big difference in the result of this function. The current
    #    ordering is based on what seems reasonable - it may not be the
    #    best way to do things. E.g., if maxHspsPerHit is 1 and there
    #    is a title regex, which should we perform first?
    #
    #    We perform filtering based on alignment before that based on
    #    HSPs. That's because there's no point filtering all HSPs for
    #    an alignment that we end up throwing away anyhow.
    #
    # 2. This function could be made faster if it first looked at its
    #    arguments and dynamically created an acceptance function
    #    (taking a readAlignments as an argument). The acceptance
    #    function would run without examining the above arguments for
    #    each match the way the current code does.
    #
    # 3. A better approach with readIdRegex might be to allow the
    #    passing of a regex object. Then the caller would make the
    #    regex with whatever flags they liked (e.g., case insensitive).

    #
    # Alignment-only (i.e., non-HSP based) filtering.
    #

    # If we've been asked to filter on matched sequence titles in any way,
    # build a title filter.
    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if readIdRegex is not None:
        readIdRegex = re.compile(readIdRegex)

    # Create the lineage fetcher last, so that nothing can fail between
    # its creation and the try/finally below that guarantees it is closed.
    lineageFetcher = LineageFetcher() if taxonomy is not None else None

    count = 0
    try:
        for readAlignments in self._iterators[iteratorIndex]():
            if limit is not None and count == limit:
                return

            # Filter on the read id.
            if (readIdRegex and
                    readIdRegex.search(readAlignments.read.id) is None):
                continue

            if titleFilter:
                # Remove alignments against sequences whose titles are
                # unacceptable.
                wantedAlignments = [
                    alignment for alignment in readAlignments
                    if (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT)]
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            # Only return alignments that are against sequences of the
            # desired length.
            if minSequenceLen is not None or maxSequenceLen is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    length = alignment.subjectLength
                    if not ((minSequenceLen is not None and
                             length < minSequenceLen) or
                            (maxSequenceLen is not None and
                             length > maxSequenceLen)):
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            if taxonomy is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    lineage = lineageFetcher.lineage(alignment.subjectTitle)
                    # If no lineage info was found, keep the alignment
                    # since we can't rule it out (we could add another
                    # option to control this). Otherwise keep it if any
                    # lineage entry contains the wanted taxonomy. Using
                    # any() makes sure an alignment is kept once, even if
                    # several entries match.
                    if not lineage or any(taxonomy in idAndName
                                          for idAndName in lineage):
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            if oneAlignmentPerRead and readAlignments:
                readAlignments[:] = [bestAlignment(readAlignments)]

            #
            # From here on we do only HSP-based filtering.
            #

            # Throw out any unwanted HSPs due to maxHspsPerHit.
            if maxHspsPerHit is not None:
                for alignment in readAlignments:
                    hsps = alignment.hsps
                    if len(hsps) > maxHspsPerHit:
                        alignment.hsps = hsps[:maxHspsPerHit]

            # Throw out HSPs whose scores are not good enough.
            if scoreCutoff is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    wantedHsps = [hsp for hsp in alignment.hsps
                                  if hsp.betterThan(scoreCutoff)]
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            # Throw out HSPs that don't match in the desired place on the
            # matched sequence.
            if minStart is not None or maxStop is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    wantedHsps = []
                    for hsp in alignment.hsps:
                        if not ((minStart is not None and
                                 hsp.readStartInSubject < minStart) or
                                (maxStop is not None and
                                 hsp.readEndInSubject > maxStop)):
                            wantedHsps.append(hsp)
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            yield readAlignments
            count += 1
    finally:
        # Always release the lineage fetcher: on normal exhaustion, when
        # the limit is reached (the early return above), when an error is
        # raised, and when the caller abandons or closes the generator.
        if lineageFetcher is not None:
            lineageFetcher.close()
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of records to read.
    @param maxAlignmentsPerRead: An C{int} limit on the number of alignments a
        read may have in order not to be filtered out. Reads with a greater
        number of alignments will be elided. Pass 0 to filter out reads that
        did not match (i.e., align to) any subjects. Use C{None} for no max
        alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched sequence
        should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best
        alignment for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are always
        acceptable (though the match info for a whitelist title may rule it
        out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param whitelistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are always acceptable.
    @param blacklistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are never acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title will
        be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. eg 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    """

    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None,
                 minStart=None, maxStop=None,
                 oneAlignmentPerRead=False, maxHspsPerHit=None,
                 scoreCutoff=None, whitelist=None, blacklist=None,
                 whitelistFile=None, blacklistFile=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or whitelistFile or blacklistFile or
                titleRegex or negativeTitleRegex or truncateTitlesAfter):
            self.titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                whitelistFile=whitelistFile, blacklistFile=blacklistFile,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        # A lineage fetcher is only created when taxonomy filtering is on.
        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        # The read id regex is passed as a string and compiled once here.
        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        # The number of reads accepted so far (compared against limit).
        self.count = 0

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        Note that the passed C{readAlignments} (and the C{hsps} lists of its
        alignments) are modified in place.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """
        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None and
                len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex and
                self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = [
                alignment for alignment in readAlignments
                if (self.titleFilter.accept(alignment.subjectTitle) !=
                    TitleFilter.REJECT)]
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                if not ((minSequenceLen is not None and
                         length < minSequenceLen) or
                        (maxSequenceLen is not None and
                         length > maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        if self.taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = self.lineageFetcher.lineage(alignment.subjectTitle)
                # If no lineage info was found, keep the alignment since
                # we can't rule it out (we could add another option to
                # control this). Otherwise keep it if any lineage entry
                # contains the wanted taxonomy. Using any() makes sure an
                # alignment is kept once, even if several entries match.
                if not lineage or any(self.taxonomy in idAndName
                                      for idAndName in lineage):
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if self.maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > self.maxHspsPerHit:
                    alignment.hsps = hsps[:self.maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if self.scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                wantedHsps = [hsp for hsp in alignment.hsps
                              if hsp.betterThan(self.scoreCutoff)]
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                wantedHsps = []
                for hsp in alignment.hsps:
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Test the fetcher itself, not the truthiness of self.taxonomy: a
        # fetcher is created for any taxonomy that is not None (including
        # falsy values such as 0 or ''), and it must be closed then too.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
            oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
            blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
            taxonomy, iteratorIndex, readIdRegex):
    """
    Filter the read alignments in self.

    Do not call this function directly, instead use self.filter.
    Argument defaults and descriptions (other than for iteratorIndex) are
    as in self.filter.

    Each yielded C{ReadAlignments} instance (and the C{hsps} lists of its
    alignments) is modified in place by the filtering.

    @param iteratorIndex: An index into self._iterators. Calling the
        iterator function will return a generator that yields
        C{ReadAlignments} instances.
    @return: A generator that yields C{ReadAlignments} instances.
    """
    # Implementation notes:
    #
    # 1. The order in which we carry out the filtering actions can make
    #    a big difference in the result of this function. The current
    #    ordering is based on what seems reasonable - it may not be the
    #    best way to do things. E.g., if maxHspsPerHit is 1 and there
    #    is a title regex, which should we perform first?
    #
    #    We perform filtering based on alignment before that based on
    #    HSPs. That's because there's no point filtering all HSPs for
    #    an alignment that we end up throwing away anyhow.
    #
    # 2. This function could be made faster if it first looked at its
    #    arguments and dynamically created an acceptance function
    #    (taking a readAlignments as an argument). The acceptance
    #    function would run without examining the above arguments for
    #    each match the way the current code does.
    #
    # 3. A better approach with readIdRegex might be to allow the
    #    passing of a regex object. Then the caller would make the
    #    regex with whatever flags they liked (e.g., case insensitive).

    #
    # Alignment-only (i.e., non-HSP based) filtering.
    #

    # If we've been asked to filter on matched sequence titles in any way,
    # build a title filter.
    if (whitelist or blacklist or titleRegex or negativeTitleRegex or
            truncateTitlesAfter):
        titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
    else:
        titleFilter = None

    if readIdRegex is not None:
        readIdRegex = re.compile(readIdRegex)

    # Create the lineage fetcher last, so that nothing can fail between
    # its creation and the try/finally below that guarantees it is closed.
    lineageFetcher = LineageFetcher() if taxonomy is not None else None

    count = 0
    try:
        for readAlignments in self._iterators[iteratorIndex]():
            if limit is not None and count == limit:
                return

            # Filter on the read id.
            if (readIdRegex and
                    readIdRegex.search(readAlignments.read.id) is None):
                continue

            if titleFilter:
                # Remove alignments against sequences whose titles are
                # unacceptable.
                wantedAlignments = [
                    alignment for alignment in readAlignments
                    if (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT)]
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            # Only return alignments that are against sequences of the
            # desired length.
            if minSequenceLen is not None or maxSequenceLen is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    length = alignment.subjectLength
                    if not ((minSequenceLen is not None and
                             length < minSequenceLen) or
                            (maxSequenceLen is not None and
                             length > maxSequenceLen)):
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            if taxonomy is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    lineage = lineageFetcher.lineage(alignment.subjectTitle)
                    # If no lineage info was found, keep the alignment
                    # since we can't rule it out (we could add another
                    # option to control this). Otherwise keep it if any
                    # lineage entry contains the wanted taxonomy. Using
                    # any() makes sure an alignment is kept once, even if
                    # several entries match.
                    if not lineage or any(taxonomy in idAndName
                                          for idAndName in lineage):
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            if oneAlignmentPerRead and readAlignments:
                readAlignments[:] = [bestAlignment(readAlignments)]

            #
            # From here on we do only HSP-based filtering.
            #

            # Throw out any unwanted HSPs due to maxHspsPerHit.
            if maxHspsPerHit is not None:
                for alignment in readAlignments:
                    hsps = alignment.hsps
                    if len(hsps) > maxHspsPerHit:
                        alignment.hsps = hsps[:maxHspsPerHit]

            # Throw out HSPs whose scores are not good enough.
            if scoreCutoff is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    wantedHsps = [hsp for hsp in alignment.hsps
                                  if hsp.betterThan(scoreCutoff)]
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            # Throw out HSPs that don't match in the desired place on the
            # matched sequence.
            if minStart is not None or maxStop is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    wantedHsps = []
                    for hsp in alignment.hsps:
                        if not ((minStart is not None and
                                 hsp.readStartInSubject < minStart) or
                                (maxStop is not None and
                                 hsp.readEndInSubject > maxStop)):
                            wantedHsps.append(hsp)
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if not wantedAlignments:
                    continue
                readAlignments[:] = wantedAlignments

            yield readAlignments
            count += 1
    finally:
        # Always release the lineage fetcher: on normal exhaustion, when
        # the limit is reached (the early return above), when an error is
        # raised, and when the caller abandons or closes the generator.
        if lineageFetcher is not None:
            lineageFetcher.close()
e, file=sys.stderr) sys.exit(1) if args.detailsFile is not None: detailsFp = open(args.detailsFile, 'w') def details(accept, readId, taxonomy): return writeDetails(accept, readId, taxonomy, detailsFp) else: detailsFp = None def details(accept, readId, taxonomy): return None lineageFetcher = LineageFetcher() reads = FastaReads(sys.stdin) save = sys.stdout.write readCount = saveCount = noTaxonomyCount = 0 for read in reads: readCount += 1 fasta = read.toString('fasta') taxonomy = lineageFetcher.lineage(read.id) if taxonomy: for taxonomyId, scientificName in taxonomy: if regexp.match(scientificName): details(True, read.id, taxonomy) if not args.invert: saveCount += 1 save(fasta)