Beispiel #1
0
 def testBlacklist(self):
     """
     A title that is on the blacklist must be rejected, even though a
     positive regex is also passed to the filter.
     """
     titleFilter = TitleFilter(positiveRegex='ok', blacklist=['never ok'])
     verdict = titleFilter.accept('never ok')
     self.assertEqual(TitleFilter.REJECT, verdict)
Beispiel #2
0
 def testNoRestriction(self):
     """
     A title filter built with no arguments must answer
     C{TitleFilter.DEFAULT_ACCEPT} when asked about a title.
     """
     unrestricted = TitleFilter()
     outcome = unrestricted.accept('hey')
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, outcome)
 def testBlacklist(self):
     """
     Asking a filter about a title that is in its blacklist must give
     C{TitleFilter.REJECT}, even though a positive regex is also given.
     """
     blacklisted = "never ok"
     titleFilter = TitleFilter(positiveRegex="ok", blacklist=[blacklisted])
     self.assertEqual(TitleFilter.REJECT, titleFilter.accept(blacklisted))
Beispiel #4
0
    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None, minStart=None,
                 maxStop=None, oneAlignmentPerRead=False, maxHspsPerHit=None,
                 scoreCutoff=None, percentageIdenticalCutoff=None,
                 percentagePositiveCutoff=None, whitelist=None,
                 blacklist=None, whitelistFile=None, blacklistFile=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
        """
        Record the filtering settings and build the helper objects (a title
        filter, a lineage fetcher and a compiled read id regex) that the
        settings call for. Helpers that are not needed are set to C{None}.
        """
        # Settings that are stored exactly as given.
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff
        self.percentageIdenticalCutoff = percentageIdenticalCutoff
        self.percentagePositiveCutoff = percentagePositiveCutoff

        # A title filter is only made if at least one of the title-related
        # arguments is truthy.
        titleFilteringWanted = any((
            whitelist, blacklist, whitelistFile, blacklistFile,
            titleRegex, negativeTitleRegex, truncateTitlesAfter))
        self.titleFilter = TitleFilter(
            whitelist=whitelist, blacklist=blacklist,
            whitelistFile=whitelistFile, blacklistFile=blacklistFile,
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter
        ) if titleFilteringWanted else None

        # A lineage fetcher is only made when a taxonomy was given.
        self.lineageFetcher = None if taxonomy is None else LineageFetcher()
        self.taxonomy = taxonomy

        # Compile the read id regex (if any) once, up front.
        self.readIdRegex = (
            None if readIdRegex is None else re.compile(readIdRegex))

        self.count = 0
 def testNoRestriction(self):
     """
     With no restrictions configured, the filter's answer for a title must
     be C{TitleFilter.DEFAULT_ACCEPT}.
     """
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, TitleFilter().accept("hey"))
Beispiel #6
0
    def __init__(self, assetDir='out', sampleName=None,
                 sampleNameRegex=None, format_='fasta',
                 proteinFastaFilenames=None, saveReadLengths=False,
                 titleRegex=None, negativeTitleRegex=None,
                 pathogenDataDir='pathogen-data'):
        """
        Store the given settings and set up the (initially empty) pathogen
        and sample bookkeeping.

        @param sampleNameRegex: If truthy, a regex that is compiled and
            stored. Otherwise C{None} is stored.
        @param format_: Either C{'fasta'} or C{'fastq'}.
        @param proteinFastaFilenames: Passed to C{getPathogenProteinCounts}.
        @param titleRegex: Passed to C{TitleFilter} as its positive regex.
        @param negativeTitleRegex: Passed to C{TitleFilter} as its negative
            regex.
        @raise ValueError: If C{format_} is neither C{'fasta'} nor
            C{'fastq'}.
        """
        self._assetDir = assetDir
        self._sampleName = sampleName
        if sampleNameRegex:
            self._sampleNameRegex = re.compile(sampleNameRegex)
        else:
            self._sampleNameRegex = None

        if format_ not in ('fasta', 'fastq'):
            raise ValueError("format_ must be either 'fasta' or 'fastq'.")
        self._format = format_
        self._saveReadLengths = saveReadLengths

        # Only build a title filter if a title regex of either kind was
        # given.
        self.titleFilter = TitleFilter(
            positiveRegex=titleRegex, negativeRegex=negativeTitleRegex
        ) if (titleRegex or negativeTitleRegex) else None

        self._pathogenDataDir = pathogenDataDir
        self._pathogenProteinCount = getPathogenProteinCounts(
            proteinFastaFilenames)

        # pathogenNames will be a dict of dicts of dicts. The first two keys
        # will be a pathogen name and a sample name. The final dict will
        # contain 'proteins' (a list of dicts) and 'uniqueReadCount' (an int).
        self.pathogenNames = {}
        # sampleNames is keyed by sample name and will have values that hold
        # the sample's alignment panel index.html file.
        self.sampleNames = {}
        # Made last because it is handed this (by now otherwise fully
        # initialized) instance.
        self.pathogenSampleFiles = PathogenSampleFiles(self, format_=format_)
Beispiel #7
0
 def testPositiveRegex(self):
     """
     With a positive regex in effect, a title the regex matches must be
     accepted by default and a title it does not match must be rejected.
     """
     titleFilter = TitleFilter(positiveRegex=r'x+\s')
     expectations = (
         (TitleFilter.DEFAULT_ACCEPT, 'hey xxx you'),
         (TitleFilter.REJECT, 'hey xxyou'),
     )
     for expected, title in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Beispiel #8
0
 def testWhitelistTakesPrecedenceOverBlacklist(self):
     """
     When the same title is in both the whitelist and the blacklist, the
     whitelist takes precedence and the title must be whitelist-accepted.
     """
     title = 'always ok'
     titleFilter = TitleFilter(whitelist=[title], blacklist=[title])
     self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                      titleFilter.accept(title))
 def testWhitelistTakesPrecedenceOverBlacklist(self):
     """
     A title listed in both the whitelist and the blacklist must get
     C{TitleFilter.WHITELIST_ACCEPT} (i.e., the whitelist wins).
     """
     titleFilter = TitleFilter(
         blacklist=["always ok"], whitelist=["always ok"])
     verdict = titleFilter.accept("always ok")
     self.assertEqual(TitleFilter.WHITELIST_ACCEPT, verdict)
 def testNegativeRegex(self):
     """
     With a negative regex in effect, a title the regex matches must be
     rejected and a title it does not match must be accepted by default.
     """
     titleFilter = TitleFilter(negativeRegex=r"x+\s")
     matching = titleFilter.accept("hey xxx you")
     self.assertEqual(TitleFilter.REJECT, matching)
     nonMatching = titleFilter.accept("hey xxyou")
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, nonMatching)
 def testWhitelist(self):
     """
     A whitelisted title must be whitelist-accepted even though the
     negative regex matches it, whereas a title that is not in the
     whitelist and that the negative regex matches must be rejected.
     """
     titleFilter = TitleFilter(negativeRegex="ok", whitelist=["always ok"])
     expectations = (
         (TitleFilter.WHITELIST_ACCEPT, "always ok"),
         (TitleFilter.REJECT, "always ok not"),
     )
     for expected, title in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Beispiel #12
0
 def testWhitelist(self):
     """
     Being in the whitelist must get a title accepted even though it would
     otherwise be ruled out (here by the negative regex). A similar title
     that is not whitelisted must be rejected.
     """
     titleFilter = TitleFilter(negativeRegex='ok', whitelist=['always ok'])
     whitelisted = titleFilter.accept('always ok')
     self.assertEqual(TitleFilter.WHITELIST_ACCEPT, whitelisted)
     notWhitelisted = titleFilter.accept('always ok not')
     self.assertEqual(TitleFilter.REJECT, notWhitelisted)
Beispiel #13
0
 def testNegativeRegex(self):
     """
     A filter with a negative regex must reject a title that the regex
     matches and must accept (by default) a title that it does not match.
     """
     titleFilter = TitleFilter(negativeRegex=r'x+\s')
     expectations = (
         (TitleFilter.REJECT, 'hey xxx you'),
         (TitleFilter.DEFAULT_ACCEPT, 'hey xxyou'),
     )
     for expected, title in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Beispiel #14
0
 def testPositiveRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
     """
     A title that does not match the positive regex must be rejected both
     times it is passed, even though title truncation is also in effect
     and the title contains the truncation string.
     """
     titleFilter = TitleFilter(truncateAfter='virus', positiveRegex=r'xxxxx')
     # Ask about the identical title twice, using the same filter.
     for _ in range(2):
         self.assertEqual(TitleFilter.REJECT,
                          titleFilter.accept('spotty virus 1'))
Beispiel #15
0
 def testWhitelistOnly(self):
     """
     With a whitelist and a negative regex that matches every title used
     here, only the whitelisted title must be accepted. The other titles
     must be rejected.
     """
     titleFilter = TitleFilter(negativeRegex='.', whitelist=['always ok'])
     self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                      titleFilter.accept('always ok'))
     for unlisted in ('always not ok', 'rubbish'):
         self.assertEqual(TitleFilter.REJECT, titleFilter.accept(unlisted))
Beispiel #16
0
 def testNegativeRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
     """
     A title that matches the negative regex must be rejected both times
     it is passed, even though title truncation is also in effect and the
     title contains the truncation string.
     """
     titleFilter = TitleFilter(truncateAfter='virus', negativeRegex=r'spotty')
     # Ask about the identical title twice, using the same filter.
     for _ in range(2):
         self.assertEqual(TitleFilter.REJECT,
                          titleFilter.accept('spotty virus 1'))
 def testWhitelistOnly(self):
     """
     When the negative regex matches every title used here, a title must
     be whitelisted to be accepted. Titles that are not whitelisted must
     be rejected.
     """
     titleFilter = TitleFilter(negativeRegex=".", whitelist=["always ok"])
     expectations = (
         (TitleFilter.WHITELIST_ACCEPT, "always ok"),
         (TitleFilter.REJECT, "always not ok"),
         (TitleFilter.REJECT, "rubbish"),
     )
     for expected, title in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
 def testNegativeRegexHasPrecedenceOverRepeatedTruncatedTitle(self):
     """
     Passing the same title twice to a filter that also truncates titles
     must give a rejection on both occasions when the title matches the
     negative regex.
     """
     title = "spotty virus 1"
     titleFilter = TitleFilter(truncateAfter="virus", negativeRegex=r"spotty")
     firstVerdict = titleFilter.accept(title)
     self.assertEqual(TitleFilter.REJECT, firstVerdict)
     secondVerdict = titleFilter.accept(title)
     self.assertEqual(TitleFilter.REJECT, secondVerdict)
Beispiel #19
0
 def testBlacklistFile(self):
     """
     Ids given in a blacklist file must be rejected, while an id that is
     not in the file must be accepted by default.
     """
     blacklisted = ['id1', 'id2']
     data = '\n'.join(blacklisted) + '\n'
     opener = mock_open(read_data=data)
     # Patch the builtin open so the filter reads the data above instead
     # of a real file.
     with patch.object(builtins, 'open', opener):
         titleFilter = TitleFilter(blacklistFile='black.txt')
         for title in blacklisted:
             self.assertEqual(TitleFilter.REJECT, titleFilter.accept(title))
         self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                          titleFilter.accept('id3'))
Beispiel #20
0
 def testBlacklistFile(self):
     """
     A filter given a blacklist file must reject the ids the file
     contains and accept (by default) an id the file does not contain.
     """
     data = '\n'.join(['id1', 'id2']) + '\n'
     # The builtin open is patched so that the data above is what gets
     # read.
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         titleFilter = TitleFilter(blacklistFile='black.txt')
         expectations = (
             (TitleFilter.REJECT, 'id1'),
             (TitleFilter.REJECT, 'id2'),
             (TitleFilter.DEFAULT_ACCEPT, 'id3'),
         )
         for expected, title in expectations:
             self.assertEqual(expected, titleFilter.accept(title))
 def testPartialWordTruncation(self):
     """
     Title truncation must work when the C{truncateAfter} string occurs
     as part of a longer word in the title: the first title is accepted
     by default and a second title, differing from it only after the
     truncation string, is rejected.
     """
     titleFilter = TitleFilter(truncateAfter=r"virus")
     # Note that the truncation code will chop off the first part of the
     # title (the title ID).
     prefix = "gi|400684|gb|AY421767.1| rotavirus "
     first = titleFilter.accept(prefix + "1")
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, first)
     second = titleFilter.accept(prefix + "2")
     self.assertEqual(TitleFilter.REJECT, second)
 def testWordTruncationRepeat(self):
     """
     When titles are being truncated, passing exactly the same title a
     second time must still result in it being accepted by default.
     """
     titleFilter = TitleFilter(truncateAfter=r"virus")
     # Note that the truncation code will chop off the first part of the
     # title (the title ID).
     title = "gi|400684|gb|AY421767.1| herpes virus 1"
     for _ in range(2):
         self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                          titleFilter.accept(title))
Beispiel #23
0
 def testWhitelistFileOnly(self):
     """
     With a whitelist file and a negative regex that matches every id used
     here, the ids in the file must be whitelist-accepted and an id that
     is not in the file must be rejected.
     """
     whitelisted = ['id1', 'id2']
     data = '\n'.join(whitelisted) + '\n'
     opener = mock_open(read_data=data)
     # Patch the builtin open so the filter reads the data above instead
     # of a real file.
     with patch.object(builtins, 'open', opener):
         titleFilter = TitleFilter(negativeRegex='.',
                                   whitelistFile='white.txt')
         for title in whitelisted:
             self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                              titleFilter.accept(title))
         self.assertEqual(TitleFilter.REJECT, titleFilter.accept('id3'))
Beispiel #24
0
 def testWhitelistFileOnly(self):
     """
     Ids that are in the whitelist file must be whitelist-accepted, and an
     id that is not in it must be rejected, when the negative regex matches
     every id used here.
     """
     data = '\n'.join(['id1', 'id2']) + '\n'
     # The builtin open is patched so that the data above is what gets
     # read.
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         titleFilter = TitleFilter(negativeRegex='.',
                                   whitelistFile='white.txt')
         expectations = (
             (TitleFilter.WHITELIST_ACCEPT, 'id1'),
             (TitleFilter.WHITELIST_ACCEPT, 'id2'),
             (TitleFilter.REJECT, 'id3'),
         )
         for expected, title in expectations:
             self.assertEqual(expected, titleFilter.accept(title))
Beispiel #25
0
 def testBlacklistFileAndBlacklist(self):
     """
     When both a blacklist file and an explicit blacklist are given, ids
     from either source must be rejected and an id that is in neither
     must be accepted by default.
     """
     data = '\n'.join(['id1', 'id2']) + '\n'
     # The builtin open is patched so that the data above is what gets
     # read.
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         titleFilter = TitleFilter(blacklist={'id3'},
                                   blacklistFile='black.txt')
         for title in ('id1', 'id2', 'id3'):
             self.assertEqual(TitleFilter.REJECT, titleFilter.accept(title))
         self.assertEqual(TitleFilter.DEFAULT_ACCEPT,
                          titleFilter.accept('id4'))
Beispiel #26
0
 def testPartialWordTruncation(self):
     """
     When the C{truncateAfter} string is found inside a longer word of the
     title, truncation must still take place: the first title is accepted
     by default and a second title, differing from it only after the
     truncation string, is rejected.
     """
     titleFilter = TitleFilter(truncateAfter=r'virus')
     # Note that the truncation code will chop off the first part of the
     # title (the title ID).
     expectations = (
         (TitleFilter.DEFAULT_ACCEPT, 'gi|400684|gb|AY421767.1| rotavirus 1'),
         (TitleFilter.REJECT, 'gi|400684|gb|AY421767.1| rotavirus 2'),
     )
     for expected, title in expectations:
         self.assertEqual(expected, titleFilter.accept(title))
Beispiel #27
0
 def testWordTruncationRepeat(self):
     """
     The exact same title must be accepted by default on both of two
     consecutive calls, even though the title is being truncated.
     """
     titleFilter = TitleFilter(truncateAfter=r'virus')
     # Note that the truncation code will chop off the first part of the
     # title (the title ID).
     title = 'gi|400684|gb|AY421767.1| herpes virus 1'
     firstVerdict = titleFilter.accept(title)
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, firstVerdict)
     secondVerdict = titleFilter.accept(title)
     self.assertEqual(TitleFilter.DEFAULT_ACCEPT, secondVerdict)
Beispiel #28
0
 def testWhitelistFileAndWhitelistOnly(self):
     """
     When both a whitelist file and an explicit whitelist are given, along
     with a negative regex that matches every id used here, ids from either
     whitelist source must be whitelist-accepted and an id that is in
     neither must be rejected.
     """
     data = '\n'.join(['id1', 'id2']) + '\n'
     # The builtin open is patched so that the data above is what gets
     # read.
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         titleFilter = TitleFilter(negativeRegex='.',
                                   whitelist={'id3'},
                                   whitelistFile='white.txt')
         for title in ('id1', 'id2', 'id3'):
             self.assertEqual(TitleFilter.WHITELIST_ACCEPT,
                              titleFilter.accept(title))
         self.assertEqual(TitleFilter.REJECT, titleFilter.accept('id4'))
Beispiel #29
0
    def __init__(self,
                 limit=None,
                 maxAlignmentsPerRead=None,
                 minSequenceLen=None,
                 maxSequenceLen=None,
                 minStart=None,
                 maxStop=None,
                 oneAlignmentPerRead=False,
                 maxHspsPerHit=None,
                 scoreCutoff=None,
                 whitelist=None,
                 blacklist=None,
                 whitelistFile=None,
                 blacklistFile=None,
                 titleRegex=None,
                 negativeTitleRegex=None,
                 truncateTitlesAfter=None,
                 taxonomy=None,
                 readIdRegex=None):
        """
        Remember the filtering settings, and create whichever of the title
        filter, lineage fetcher and compiled read id regex are needed for
        them. The ones that are not needed are set to C{None}.
        """
        # Plain settings, kept exactly as passed.
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # Title filtering is wanted if any title-related argument is truthy.
        wantTitleFilter = bool(
            whitelist or blacklist or whitelistFile or blacklistFile or
            titleRegex or negativeTitleRegex or truncateTitlesAfter)
        if not wantTitleFilter:
            self.titleFilter = None
        else:
            self.titleFilter = TitleFilter(whitelist=whitelist,
                                           blacklist=blacklist,
                                           whitelistFile=whitelistFile,
                                           blacklistFile=blacklistFile,
                                           positiveRegex=titleRegex,
                                           negativeRegex=negativeTitleRegex,
                                           truncateAfter=truncateTitlesAfter)

        # A lineage fetcher is only needed when filtering on taxonomy.
        if taxonomy is None:
            self.lineageFetcher = None
        else:
            self.lineageFetcher = LineageFetcher()
        self.taxonomy = taxonomy

        # The read id regex, if given, is compiled once here.
        if readIdRegex is not None:
            self.readIdRegex = re.compile(readIdRegex)
        else:
            self.readIdRegex = None

        self.count = 0
Beispiel #30
0
    def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
                oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
                blacklist, titleRegex, negativeTitleRegex,
                truncateTitlesAfter, taxonomy, iteratorIndex, readIdRegex):
        """
        Filter the read alignments in self.

        Do not call this function directly, instead use self.filter (above).
        Argument defaults and descriptions (other than for iteratorIndex) are
        as in self.filter.

        This is a generator. If C{taxonomy} is not C{None} a lineage fetcher
        is created, and it is closed when the generator finishes for any
        reason: the underlying iterator is exhausted, C{limit} is reached,
        an exception is raised, or the generator is closed early by its
        consumer.

        @param iteratorIndex: An index into self._iterators. Calling the
            iterator function will return a generator that yields
            C{ReadAlignments} instances.
        @return: A generator that yields C{ReadAlignments} instances.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the above arguments for
        #    each match the way the current code does.
        #
        # 3. A better approach with readIdRegex might be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if taxonomy is not None:
            lineageFetcher = LineageFetcher()
        else:
            lineageFetcher = None

        # Everything after the creation of the lineage fetcher is inside a
        # try/finally so that the fetcher is closed however we leave this
        # generator (including the early return when the limit is reached).
        try:
            if readIdRegex is not None:
                readIdRegex = re.compile(readIdRegex)

            count = 0
            for readAlignments in self._iterators[iteratorIndex]():
                if limit is not None and count == limit:
                    return

                # Filter on the read id.
                if (readIdRegex and
                        readIdRegex.search(readAlignments.read.id) is None):
                    continue

                if titleFilter:
                    # Remove alignments against sequences whose titles are
                    # unacceptable.
                    wantedAlignments = []
                    for alignment in readAlignments:
                        if (titleFilter.accept(alignment.subjectTitle) !=
                                TitleFilter.REJECT):
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                # Only return alignments that are against sequences of the
                # desired length.
                if minSequenceLen is not None or maxSequenceLen is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        length = alignment.subjectLength
                        if not ((minSequenceLen is not None and
                                 length < minSequenceLen) or
                                (maxSequenceLen is not None and
                                 length > maxSequenceLen)):
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                if taxonomy is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        lineage = lineageFetcher.lineage(
                            alignment.subjectTitle)
                        if lineage:
                            # Keep the alignment (just once) if the taxonomy
                            # is found in any entry of its lineage. Using
                            # any() avoids adding the same alignment several
                            # times when more than one entry matches.
                            if any(taxonomy in taxonomyIdAndScientificName
                                   for taxonomyIdAndScientificName
                                   in lineage):
                                wantedAlignments.append(alignment)
                        else:
                            # No lineage info was found. Keep the alignment
                            # since we can't rule it out.  We could add
                            # another option to control this.
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                if oneAlignmentPerRead and readAlignments:
                    readAlignments[:] = [bestAlignment(readAlignments)]

                #
                # From here on we do only HSP-based filtering.
                #

                # Throw out any unwanted HSPs due to maxHspsPerHit.
                if maxHspsPerHit is not None:
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        if len(hsps) > maxHspsPerHit:
                            alignment.hsps = hsps[:maxHspsPerHit]

                # Throw out HSPs whose scores are not good enough.
                if scoreCutoff is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        wantedHsps = []
                        for hsp in hsps:
                            if hsp.betterThan(scoreCutoff):
                                wantedHsps.append(hsp)
                        # An alignment with no remaining HSPs is dropped.
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                # Throw out HSPs that don't match in the desired place on the
                # matched sequence.
                if minStart is not None or maxStop is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        wantedHsps = []
                        for hsp in hsps:
                            if not ((minStart is not None and
                                     hsp.readStartInSubject < minStart)
                                    or (maxStop is not None and
                                        hsp.readEndInSubject > maxStop)):
                                wantedHsps.append(hsp)
                        # An alignment with no remaining HSPs is dropped.
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                yield readAlignments
                count += 1
        finally:
            # Close the fetcher whenever one was created (i.e., whenever
            # taxonomy is not None, even if taxonomy is otherwise falsy).
            if lineageFetcher is not None:
                lineageFetcher.close()
Beispiel #31
0
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of reads that will be
        allowed to pass the filter. Once that many reads have been accepted,
        all further reads are rejected. Use C{None} for no limit.
    @param maxAlignmentsPerRead: An C{int} limit on the number of alignments a
        read may have in order not to be filtered out. Reads with a greater
        number of alignments will be elided. Pass 0 to reject every read
        that matched (i.e., aligned to) at least one subject. Use C{None}
        for no max alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched sequence
        should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best
        alignment for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are always
        acceptable (though the match info for a whitelist title may rule it
        out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param whitelistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are always acceptable.
    @param blacklistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are never acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title will
        be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. eg 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    """
    def __init__(self,
                 limit=None,
                 maxAlignmentsPerRead=None,
                 minSequenceLen=None,
                 maxSequenceLen=None,
                 minStart=None,
                 maxStop=None,
                 oneAlignmentPerRead=False,
                 maxHspsPerHit=None,
                 scoreCutoff=None,
                 whitelist=None,
                 blacklist=None,
                 whitelistFile=None,
                 blacklistFile=None,
                 titleRegex=None,
                 negativeTitleRegex=None,
                 truncateTitlesAfter=None,
                 taxonomy=None,
                 readIdRegex=None):

        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or whitelistFile or blacklistFile
                or titleRegex or negativeTitleRegex or truncateTitlesAfter):
            self.titleFilter = TitleFilter(whitelist=whitelist,
                                           blacklist=blacklist,
                                           whitelistFile=whitelistFile,
                                           blacklistFile=blacklistFile,
                                           positiveRegex=titleRegex,
                                           negativeRegex=negativeTitleRegex,
                                           truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        # A lineage fetcher is only needed (and only created) when
        # taxonomy filtering was requested.
        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        # The number of reads that have passed the filter so far.
        self.count = 0

    @staticmethod
    def _keepAlignments(readAlignments, wanted):
        """
        Retain, in place, only the alignments for which C{wanted} is true.

        @param readAlignments: A mutable sequence of alignments.
        @param wanted: A function that is passed an alignment and returns a
            true value if the alignment should be kept.
        @return: C{True} if at least one alignment was kept. Otherwise
            C{False}, in which case C{readAlignments} is left unmodified.
        """
        kept = [alignment for alignment in readAlignments
                if wanted(alignment)]
        if kept:
            readAlignments[:] = kept
            return True
        return False

    @staticmethod
    def _keepHsps(readAlignments, wanted):
        """
        Retain, in place, only the HSPs for which C{wanted} is true, and
        drop any alignment that is left with no HSPs.

        @param readAlignments: A mutable sequence of alignments, each with
            an C{hsps} attribute.
        @param wanted: A function that is passed an HSP and returns a true
            value if the HSP should be kept.
        @return: C{True} if at least one alignment still has HSPs. Otherwise
            C{False}, in which case C{readAlignments} is left unmodified.
        """
        kept = []
        for alignment in readAlignments:
            hsps = [hsp for hsp in alignment.hsps if wanted(hsp)]
            # The HSPs of an alignment that is about to be dropped are
            # deliberately left untouched.
            if hsps:
                alignment.hsps = hsps
                kept.append(alignment)
        if kept:
            readAlignments[:] = kept
            return True
        return False

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        Note that the passed C{readAlignments} (and the C{hsps} lists of its
        alignments) may be modified in place.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None
                and len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex
                and self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            titleFilter = self.titleFilter

            def titleOk(alignment):
                return (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT)

            if not self._keepAlignments(readAlignments, titleOk):
                return False

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:

            def lengthOk(alignment):
                length = alignment.subjectLength
                tooShort = (minSequenceLen is not None
                            and length < minSequenceLen)
                tooLong = (maxSequenceLen is not None
                           and length > maxSequenceLen)
                return not (tooShort or tooLong)

            if not self._keepAlignments(readAlignments, lengthOk):
                return False

        taxonomy = self.taxonomy
        if taxonomy is not None:
            fetchLineage = self.lineageFetcher.lineage

            def taxonomyOk(alignment):
                lineage = fetchLineage(alignment.subjectTitle)
                if not lineage:
                    # No lineage info was found. Keep the alignment
                    # since we can't rule it out.  We could add another
                    # option to control this.
                    return True
                # Use any() so that an alignment is kept exactly once, even
                # if the wanted taxonomy appears at more than one level of
                # the lineage.
                return any(taxonomy in taxonomyIdAndScientificName
                           for taxonomyIdAndScientificName in lineage)

            if not self._keepAlignments(readAlignments, taxonomyOk):
                return False

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        maxHspsPerHit = self.maxHspsPerHit
        if maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > maxHspsPerHit:
                    alignment.hsps = hsps[:maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        scoreCutoff = self.scoreCutoff
        if scoreCutoff is not None:
            if not self._keepHsps(readAlignments,
                                  lambda hsp: hsp.betterThan(scoreCutoff)):
                return False

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:

            def positionOk(hsp):
                startsTooEarly = (minStart is not None
                                  and hsp.readStartInSubject < minStart)
                endsTooLate = (maxStop is not None
                               and hsp.readEndInSubject > maxStop)
                return not (startsTooEarly or endsTooLate)

            if not self._keepHsps(readAlignments, positionOk):
                return False

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Test for the fetcher itself, not the truth value of the taxonomy:
        # a fetcher is created for any non-None taxonomy, including falsy
        # values such as 0 or ''.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
Beispiel #32
0
    def filter(self,
               minMatchingReads=None,
               minMedianScore=None,
               withScoreBetterThan=None,
               minNewReads=None,
               minCoverage=None,
               maxTitles=None,
               sortOn='maxScore',
               titleRegex=None,
               negativeTitleRegex=None):
        """
        Make a new L{TitlesAlignments} holding only those titles in C{self}
        that pass all the requested tests.

        @param minMatchingReads: titles that are matched by fewer reads
            are unacceptable.
        @param minMedianScore: sequences that are matched with a median
            score that is less are unacceptable.
        @param withScoreBetterThan: if the best score for a title is not
            as good as this value, the title is not acceptable.
        @param minNewReads: The C{float} fraction of its reads by which a new
            title's read set must differ from the read sets of all previously
            seen titles in order for this title to be considered acceptably
            different (and therefore interesting).
        @param minCoverage: The C{float} minimum fraction of the title sequence
            that must be matched by at least one read.
        @param maxTitles: A non-negative C{int} maximum number of titles to
            keep. If more titles than this are present, titles will be sorted
            (according to C{sortOn}) and only the best will be retained.
        @param sortOn: A C{str} attribute to sort on, used only if C{maxTitles}
            is not C{None} and there are more than C{maxTitles} titles. See
            the C{sortTitles} method for the legal values.
        @param titleRegex: A regex that titles must match.
        @param negativeTitleRegex: A regex that titles must not match.
        @raise ValueError: If C{maxTitles} is less than zero. Presumably
            C{sortTitles} also raises this if the value of C{sortOn} is
            unknown (not verifiable from this method alone).
        @return: A new L{TitlesAlignments} instance containing only the
            matching titles.
        """
        # A ReadSetFilter only comes into play when we need to check that
        # read sets are sufficiently new. It is created on first need and
        # then kept on self for later calls.
        readSetFilter = None
        if minNewReads is not None:
            if self.readSetFilter is None:
                self.readSetFilter = ReadSetFilter(minNewReads)
            readSetFilter = self.readSetFilter

        filtered = TitlesAlignments(
            self.readsAlignments, self.scoreClass, self.readSetFilter,
            importReadsAlignmentsTitles=False)

        tooManyTitles = maxTitles is not None and len(self) > maxTitles
        if tooManyTitles and maxTitles < 0:
            raise ValueError('maxTitles (%r) cannot be negative.' %
                             maxTitles)

        if tooManyTitles:
            # Walk the titles in sorted order so the loop below can stop as
            # soon as enough have been accepted. Simply slicing off the
            # first maxTitles sorted titles here would be wrong: some of
            # them may be rejected by the tests below, leaving the result
            # with fewer titles than it should have.
            candidates = self.sortTitles(sortOn)
        else:
            candidates = self.keys()

        titleFilter = None
        if titleRegex or negativeTitleRegex:
            titleFilter = TitleFilter(positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex)

        for title in candidates:
            # The size check comes first because maxTitles may be zero.
            if maxTitles is not None and len(filtered) == maxTitles:
                break

            # Positive and negative title regexps.
            if titleFilter:
                if titleFilter.accept(title) == TitleFilter.REJECT:
                    continue

            alignments = self[title]

            if minMatchingReads is not None:
                if alignments.readCount() < minMatchingReads:
                    continue

            if minMedianScore is not None:
                # Wrap both values in the score class used by this data set
                # so that they can be compared without needing to know
                # whether numerically greater scores are better or worse.
                median = self.scoreClass(alignments.medianScore())
                if median < self.scoreClass(minMedianScore):
                    continue

            if withScoreBetterThan is not None:
                if not alignments.hasScoreBetterThan(withScoreBetterThan):
                    continue

            if minCoverage is not None:
                if alignments.coverage() < minCoverage:
                    continue

            # The read set test goes last.
            if readSetFilter:
                if not readSetFilter.accept(title, alignments):
                    continue

            filtered.addTitle(title, alignments)

        return filtered
Beispiel #33
0
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of reads that will be
        allowed to pass the filter. Once that many reads have been accepted,
        all further reads are rejected. Use C{None} for no limit.
    @param maxAlignmentsPerRead: An C{int} limit on the number of alignments a
        read may have in order not to be filtered out. Reads with a greater
        number of alignments will be elided. Pass 0 to reject every read
        that matched (i.e., aligned to) at least one subject. Use C{None}
        for no max alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched sequence
        should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best
        alignment for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are always
        acceptable (though the match info for a whitelist title may rule it
        out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title will
        be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. eg 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    """
    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None,
                 minStart=None, maxStop=None,
                 oneAlignmentPerRead=False, maxHspsPerHit=None,
                 scoreCutoff=None, whitelist=None, blacklist=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):

        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            self.titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        # A lineage fetcher is only needed (and only created) when
        # taxonomy filtering was requested.
        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        # The number of reads that have passed the filter so far.
        self.count = 0

    @staticmethod
    def _keepAlignments(readAlignments, wanted):
        """
        Retain, in place, only the alignments for which C{wanted} is true.

        @param readAlignments: A mutable sequence of alignments.
        @param wanted: A function that is passed an alignment and returns a
            true value if the alignment should be kept.
        @return: C{True} if at least one alignment was kept. Otherwise
            C{False}, in which case C{readAlignments} is left unmodified.
        """
        kept = [alignment for alignment in readAlignments
                if wanted(alignment)]
        if kept:
            readAlignments[:] = kept
            return True
        return False

    @staticmethod
    def _keepHsps(readAlignments, wanted):
        """
        Retain, in place, only the HSPs for which C{wanted} is true, and
        drop any alignment that is left with no HSPs.

        @param readAlignments: A mutable sequence of alignments, each with
            an C{hsps} attribute.
        @param wanted: A function that is passed an HSP and returns a true
            value if the HSP should be kept.
        @return: C{True} if at least one alignment still has HSPs. Otherwise
            C{False}, in which case C{readAlignments} is left unmodified.
        """
        kept = []
        for alignment in readAlignments:
            hsps = [hsp for hsp in alignment.hsps if wanted(hsp)]
            # The HSPs of an alignment that is about to be dropped are
            # deliberately left untouched.
            if hsps:
                alignment.hsps = hsps
                kept.append(alignment)
        if kept:
            readAlignments[:] = kept
            return True
        return False

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        Note that the passed C{readAlignments} (and the C{hsps} lists of its
        alignments) may be modified in place.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None and
                len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex and
                self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            titleFilter = self.titleFilter

            def titleOk(alignment):
                return (titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT)

            if not self._keepAlignments(readAlignments, titleOk):
                return False

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:

            def lengthOk(alignment):
                length = alignment.subjectLength
                tooShort = (minSequenceLen is not None and
                            length < minSequenceLen)
                tooLong = (maxSequenceLen is not None and
                           length > maxSequenceLen)
                return not (tooShort or tooLong)

            if not self._keepAlignments(readAlignments, lengthOk):
                return False

        taxonomy = self.taxonomy
        if taxonomy is not None:
            fetchLineage = self.lineageFetcher.lineage

            def taxonomyOk(alignment):
                lineage = fetchLineage(alignment.subjectTitle)
                if not lineage:
                    # No lineage info was found. Keep the alignment
                    # since we can't rule it out.  We could add another
                    # option to control this.
                    return True
                # Use any() so that an alignment is kept exactly once, even
                # if the wanted taxonomy appears at more than one level of
                # the lineage.
                return any(taxonomy in taxonomyIdAndScientificName
                           for taxonomyIdAndScientificName in lineage)

            if not self._keepAlignments(readAlignments, taxonomyOk):
                return False

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        maxHspsPerHit = self.maxHspsPerHit
        if maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > maxHspsPerHit:
                    alignment.hsps = hsps[:maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        scoreCutoff = self.scoreCutoff
        if scoreCutoff is not None:
            if not self._keepHsps(readAlignments,
                                  lambda hsp: hsp.betterThan(scoreCutoff)):
                return False

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:

            def positionOk(hsp):
                startsTooEarly = (minStart is not None and
                                  hsp.readStartInSubject < minStart)
                endsTooLate = (maxStop is not None and
                               hsp.readEndInSubject > maxStop)
                return not (startsTooEarly or endsTooLate)

            if not self._keepHsps(readAlignments, positionOk):
                return False

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Test for the fetcher itself, not the truth value of the taxonomy:
        # a fetcher is created for any non-None taxonomy, including falsy
        # values such as 0 or ''.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
Beispiel #34
0
class ProteinGrouper(object):
    """
    Group matched proteins by the pathogen they come from.

    @param assetDir: The C{str} directory name where
        C{noninteractive-alignment-panel.py} put its HTML, blue plot and
        alignment panel images, and FASTA or FASTQ files. This must be relative
        to the filenames that will later be passed to C{addFile}.
    @param sampleName: A C{str} sample name. This takes precedence over
        C{sampleNameRegex} (the two cannot be used together, obviously).
    @param sampleNameRegex: A C{str} regular expression that can be used to
        extract a short sample name from full file names subsequently passed
        to C{self.addFile}. The regular expression must have a matching group
        (delimited by parentheses) to capture the part of the file name that
        should be used as the sample name.
    @param format_: A C{str}, either 'fasta' or 'fastq' indicating the format
        of the files containing the reads matching proteins.
    @param proteinFastaFilenames: If not C{None}, a C{list} of C{str} filenames
        giving the name of the FASTA file with the protein AA sequences with
        their associated pathogens in square brackets. This is the format used
        by NCBI for the bacterial and viral reference sequence protein files.
        If given, the contents of this file will be used to determine how many
        proteins each matched pathogen has.
    @param saveReadLengths: If C{True}, save the lengths of all reads matching
        proteins.
    @param titleRegex: A regex that pathogen names must match.
        Note that this matching is done on the final part of the protein title
        in square brackets, according to the convention used by the NCBI viral
        refseq database and RVDB.
    @param negativeTitleRegex: A regex that pathogen names must not match.
        Note that this matching is done on the final part of the protein title
        in square brackets, according to the convention used by the NCBI viral
        refseq database and RVDB.
    @param pathogenDataDir: The C{str} directory where per-pathogen information
        (e.g., collected reads across all samples) should be written. Will be
        created (in C{self.toHTML}) if it doesn't exist.
    @raise ValueError: If C{format_} is unknown.
    """

    VIRALZONE = 'https://viralzone.expasy.org/search?query='
    ICTV = 'https://talk.ictvonline.org/search-124283882/?q='
    READCOUNT_MARKER = '*READ-COUNT*'
    READ_AND_HSP_COUNT_STR_SEP = '/'

    def __init__(self, assetDir='out', sampleName=None, sampleNameRegex=None,
                 format_='fasta', proteinFastaFilenames=None,
                 saveReadLengths=False, titleRegex=None,
                 negativeTitleRegex=None, pathogenDataDir='pathogen-data'):
        self._assetDir = assetDir
        self._sampleName = sampleName
        self._sampleNameRegex = (re.compile(sampleNameRegex) if sampleNameRegex
                                 else None)
        if format_ in ('fasta', 'fastq'):
            self._format = format_
        else:
            raise ValueError("format_ must be either 'fasta' or 'fastq'.")
        self._saveReadLengths = saveReadLengths

        if titleRegex or negativeTitleRegex:
            self.titleFilter = TitleFilter(
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex)
        else:
            self.titleFilter = None

        self._pathogenDataDir = pathogenDataDir

        self._pathogenProteinCount = getPathogenProteinCounts(
            proteinFastaFilenames)

        # pathogenNames will be a dict of dicts of dicts. The first two keys
        # will be a pathogen name and a sample name. The final dict will
        # contain 'proteins' (a list of dicts) and 'uniqueReadCount' (an int).
        self.pathogenNames = {}
        # sampleNames is keyed by sample name and will have values that hold
        # the sample's alignment panel index.html file.
        self.sampleNames = {}
        self.pathogenSampleFiles = PathogenSampleFiles(self, format_=format_)

    def _title(self):
        """
        Create a title summarizing the pathogens and samples.

        @return: A C{str} title.
        """
        return (
            'Overall, proteins from %d pathogen%s were found in %d sample%s.' %
            (len(self.pathogenNames),
             '' if len(self.pathogenNames) == 1 else 's',
             len(self.sampleNames),
             '' if len(self.sampleNames) == 1 else 's'))

    def addFile(self, filename, fp):
        """
        Read and record protein information for a sample.

        @param filename: A C{str} file name.
        @param fp: An open file pointer to read the file's data from.
        @raise ValueError: If information for a pathogen/protein/sample
            combination is given more than once.
        """
        if self._sampleName:
            sampleName = self._sampleName
        elif self._sampleNameRegex:
            match = self._sampleNameRegex.search(filename)
            if match:
                sampleName = match.group(1)
            else:
                sampleName = filename
        else:
            sampleName = filename

        outDir = join(dirname(filename), self._assetDir)

        self.sampleNames[sampleName] = join(outDir, 'index.html')

        for index, proteinLine in enumerate(fp):
            proteinLine = proteinLine[:-1]
            (coverage, medianScore, bestScore, readCount, hspCount,
             proteinLength, names) = proteinLine.split(None, 6)

            proteinName, pathogenName = splitNames(names)

            # Ignore pathogens with names we don't want.
            if (self.titleFilter and self.titleFilter.accept(
                    pathogenName) == TitleFilter.REJECT):
                continue

            if pathogenName not in self.pathogenNames:
                self.pathogenNames[pathogenName] = {}

            if sampleName not in self.pathogenNames[pathogenName]:
                self.pathogenNames[pathogenName][sampleName] = {
                    'proteins': {},
                    'uniqueReadCount': None,
                }

            proteins = self.pathogenNames[pathogenName][sampleName]['proteins']

            # We should only receive one line of information for a given
            # pathogen/sample/protein combination.
            if proteinName in proteins:
                raise ValueError(
                    'Protein %r already seen for pathogen %r sample %r.' %
                    (proteinName, pathogenName, sampleName))

            readsFilename = join(outDir, '%d.%s' % (index, self._format))

            if proteinName.count('|') < 5:
                # Assume this is an NCBI refseq id, like
                # YP_009137153.1 uracil glycosylase [Human alphaherpesvirus 2]
                # with a protein but not a genome accession.
                proteinURL = NCBISequenceLinkURL(proteinName, field=0,
                                                 delim=' ')
                genomeURL = None
            else:
                # Assume this is an RVDB id, like
                # acc|GENBANK|ABJ91970.1|GENBANK|DQ876317|pol protein [HIV]
                # with both protein and genome accession numbers.
                proteinURL = NCBISequenceLinkURL(proteinName, field=2)
                genomeURL = NCBISequenceLinkURL(proteinName, field=4)

            proteinInfo = proteins[proteinName] = {
                'bestScore': float(bestScore),
                'bluePlotFilename': join(outDir, '%d.png' % index),
                'coverage': float(coverage),
                'readsFilename': readsFilename,
                'hspCount': int(hspCount),
                'index': index,
                'medianScore': float(medianScore),
                'outDir': outDir,
                'proteinLength': int(proteinLength),
                'proteinName': proteinName,
                'proteinURL': proteinURL,
                'genomeURL': genomeURL,
                'readCount': int(readCount),
            }

            if proteinInfo['readCount'] == proteinInfo['hspCount']:
                proteinInfo['readAndHspCountStr'] = readCount
            else:
                proteinInfo['readAndHspCountStr'] = '%s%s%s' % (
                    readCount, self.READ_AND_HSP_COUNT_STR_SEP, hspCount)

            if self._saveReadLengths:
                readsClass = (FastaReads if self._format == 'fasta'
                              else FastqReads)
                proteins[proteinName]['readLengths'] = tuple(
                    len(read) for read in readsClass(readsFilename))

    def _computeUniqueReadCounts(self):
        """
        Add all pathogen / sample combinations to self.pathogenSampleFiles.

        This will make all de-duplicated (by id) FASTA/FASTQ files and store
        the number of de-duplicated reads into C{self.pathogenNames}.
        """
        for pathogenName, samples in self.pathogenNames.items():
            for sampleName in samples:
                self.pathogenSampleFiles.add(pathogenName, sampleName)

    def toStr(self, title='Summary of pathogens', preamble=None):
        """
        Produce a string representation of the pathogen summary.

        @param title: The C{str} title for the output.
        @param preamble: The C{str} descriptive preamble for the HTML page, or
            C{None} if no preamble is needed.
        @return: A C{str} suitable for printing.
        """
        # Note that the string representation contains much less
        # information than the HTML summary. E.g., it does not contain the
        # unique (de-duplicated, by id) read count, since that is only computed
        # when we are making combined FASTA files of reads matching a
        # pathogen.
        readCountGetter = itemgetter('readCount')
        result = []
        append = result.append

        result.extend((title, ''))
        if preamble:
            result.extend((preamble, ''))
        result.extend((self._title(), ''))

        for pathogenName in sorted(self.pathogenNames):
            samples = self.pathogenNames[pathogenName]
            sampleCount = len(samples)
            append('%s (in %d sample%s)' %
                   (pathogenName,
                    sampleCount, '' if sampleCount == 1 else 's'))
            for sampleName in sorted(samples):
                proteins = samples[sampleName]['proteins']
                proteinCount = len(proteins)
                totalReads = sum(readCountGetter(p) for p in proteins.values())
                append('  %s (%d protein%s, %d read%s)' %
                       (sampleName,
                        proteinCount, '' if proteinCount == 1 else 's',
                        totalReads, '' if totalReads == 1 else 's'))
                for proteinName in sorted(proteins):
                    append(
                        '    %(coverage).2f\t%(medianScore).2f\t'
                        '%(bestScore).2f\t%(readAndHspCountStr)11s\t'
                        '%(proteinName)s'
                        % proteins[proteinName])
            append('')

        return '\n'.join(result)

    def toHTML(self, pathogenPanelFilename=None, minProteinFraction=0.0,
               pathogenType='viral', title='Summary of pathogens',
               preamble=None, sampleIndexFilename=None,
               pathogenIndexFilename=None, omitVirusLinks=False,
               omitSampleProteinCount=False):
        """
        Produce an HTML string representation of the pathogen summary.

        @param pathogenPanelFilename: If not C{None}, a C{str} filename to
            write a pathogen panel PNG image to.
        @param minProteinFraction: The C{float} minimum fraction of proteins
            in a pathogen that must be matched by a sample in order for that
            pathogen to be displayed for that sample.
        @param pathogenType: A C{str} giving the type of the pathogen involved,
            either 'bacterial' or 'viral'.
        @param title: The C{str} title for the HTML page.
        @param preamble: The C{str} descriptive preamble for the HTML page, or
            C{None} if no preamble is needed.
        @param sampleIndexFilename: A C{str} filename to write a sample index
            file to. Lines in the file will have an integer index, a space, and
            then the sample name.
        @param pathogenIndexFilename: A C{str} filename to write a pathogen
            index file to. Lines in the file will have an integer index, a
            space, and then the pathogen name.
        @param omitVirusLinks: If C{True}, links to ICTV and ViralZone will be
            omitted in output.
        @param omitSampleProteinCount: If C{True}, do not display a number of
            matched pathogen proteins for a sample. This should be used when
            those numbers are inaccurate (e.g., when using the unclustered RVDB
            protein database and there are many sequences for the same
            protein).
        @return: An HTML C{str} suitable for printing.
        """
        if pathogenType not in ('bacterial', 'viral'):
            raise ValueError(
                "Unrecognized pathogenType argument: %r. Value must be either "
                "'bacterial' or 'viral'." % pathogenType)

        if not exists(self._pathogenDataDir):
            os.mkdir(self._pathogenDataDir)

        self._computeUniqueReadCounts()

        if pathogenPanelFilename:
            self.pathogenPanel(pathogenPanelFilename)

        if sampleIndexFilename:
            with open(sampleIndexFilename, 'w') as fp:
                self.pathogenSampleFiles.writeSampleIndex(fp)

        if pathogenIndexFilename:
            with open(pathogenIndexFilename, 'w') as fp:
                self.pathogenSampleFiles.writePathogenIndex(fp)

        # Figure out if we have to delete some pathogens because the
        # fraction of their proteins that we have matches for is too low.
        if minProteinFraction > 0.0:
            toDelete = defaultdict(list)
            for pathogenName in self.pathogenNames:
                proteinCount = self._pathogenProteinCount[pathogenName]
                for s in self.pathogenNames[pathogenName]:
                    if proteinCount:
                        sampleProteinFraction = (
                            len(self.pathogenNames[
                                pathogenName][s]['proteins']) /
                            proteinCount)
                    else:
                        sampleProteinFraction = 1.0
                    if sampleProteinFraction < minProteinFraction:
                        toDelete[pathogenName].append(s)

            for pathogenName in toDelete:
                for sample in toDelete[pathogenName]:
                    del self.pathogenNames[pathogenName][sample]

        pathogenNames = sorted(
            pathogenName for pathogenName in self.pathogenNames
            if len(self.pathogenNames[pathogenName]) > 0)
        nPathogenNames = len(pathogenNames)
        sampleNames = sorted(self.sampleNames)

        result = [
            '<html>',
            '<head>',
            '<title>',
            title,
            '</title>',
            '<meta charset="UTF-8">',
            '</head>',
            '<body>',
            '<style>',
            '''\
            body {
                margin-left: 2%;
                margin-right: 2%;
            }
            hr {
                display: block;
                margin-top: 0.5em;
                margin-bottom: 0.5em;
                margin-left: auto;
                margin-right: auto;
                border-style: inset;
                border-width: 1px;
            }
            p.pathogen {
                margin-top: 10px;
                margin-bottom: 3px;
            }
            p.sample {
                margin-top: 10px;
                margin-bottom: 3px;
            }
            .sample {
                margin-top: 5px;
                margin-bottom: 2px;
            }
            ul {
                margin-bottom: 2px;
            }
            .indented {
                margin-left: 2em;
            }
            .sample-name {
                font-size: 125%;
                font-weight: bold;
            }
            .pathogen-name {
                font-size: 125%;
                font-weight: bold;
            }
            .index-name {
                font-weight: bold;
            }
            .index {
                font-size: small;
            }
            .protein-name {
                font-family: "Courier New", Courier, monospace;
            }
            .stats {
                font-family: "Courier New", Courier, monospace;
                white-space: pre;
            }
            .protein-list {
                margin-top: 2px;
            }''',
            '</style>',
            '</head>',
            '<body>',
        ]

        proteinFieldsDescription = [
            '<p>',
            'In all bullet point protein lists below, there are the following '
            'fields:',
            '<ol>',
            '<li>Coverage fraction.</li>',
            '<li>Median bit score.</li>',
            '<li>Best bit score.</li>',
            '<li>Read count (if the HSP count differs, read and HSP ',
            ('counts are both given, separated by "%s").</li>' %
             self.READ_AND_HSP_COUNT_STR_SEP),
            '<li>Protein length (in amino acids).</li>',
        ]

        if self._saveReadLengths:
            proteinFieldsDescription.append(
                '<li>All read lengths (in parentheses).</li>')

        proteinFieldsDescription.extend([
            '<li>Protein name.</li>',
            '</ol>',
            '</p>',
        ])

        append = result.append

        append('<h1>%s</h1>' % title)
        if preamble:
            append('<p>%s</p>' % preamble)
        append('<p>')
        append(self._title())

        if self._pathogenProteinCount and minProteinFraction:
            percent = minProteinFraction * 100.0
            if nPathogenNames < len(self.pathogenNames):
                if nPathogenNames == 1:
                    append('Pathogen protein fraction filtering has been '
                           'applied, so information on only 1 pathogen is '
                           'displayed. This is the only pathogen for which at '
                           'least one sample matches at least %.2f%% of the '
                           'pathogen proteins.' % percent)
                else:
                    append('Pathogen protein fraction filtering has been '
                           'applied, so information on only %d pathogens is '
                           'displayed. These are the only pathogens for which '
                           'at least one sample matches at least %.2f%% of '
                           'the pathogen proteins.' % (nPathogenNames,
                                                       percent))
            else:
                append('Pathogen protein fraction filtering was applied, '
                       'but all pathogens have at least %.2f%% of their '
                       'proteins matched by at least one sample.' % percent)

        append('</p>')

        if pathogenPanelFilename:
            append('<p>')
            append('<a href="%s">Panel showing read count per pathogen, per '
                   'sample.</a>' % pathogenPanelFilename)
            append('Red vertical bars indicate samples with an unusually high '
                   'read count.')
            append('</p>')

        result.extend(proteinFieldsDescription)

        # Write a linked table of contents by pathogen.
        append('<p><span class="index-name">Pathogen index:</span>')
        append('<span class="index">')
        for pathogenName in pathogenNames:
            append('<a href="#pathogen-%s">%s</a>' % (pathogenName,
                                                      pathogenName))
            append('&middot;')
        # Get rid of final middle dot and add a period.
        result.pop()
        result[-1] += '.'
        append('</span></p>')

        # Write a linked table of contents by sample.
        append('<p><span class="index-name">Sample index:</span>')
        append('<span class="index">')
        for sampleName in sampleNames:
            append('<a href="#sample-%s">%s</a>' % (sampleName, sampleName))
            append('&middot;')
        # Get rid of final middle dot and add a period.
        result.pop()
        result[-1] += '.'
        append('</span></p>')

        # Write all pathogens (with samples (with proteins)).
        append('<hr>')
        append('<h1>Pathogens by sample</h1>')

        for pathogenName in pathogenNames:
            samples = self.pathogenNames[pathogenName]
            sampleCount = len(samples)
            pathogenProteinCount = self._pathogenProteinCount[pathogenName]
            if pathogenType == 'viral' and not omitVirusLinks:
                quoted = quote(pathogenName)
                pathogenLinksHTML = (
                    ' (<a href="%s%s">ICTV</a>, <a href="%s%s">ViralZone</a>)'
                ) % (self.ICTV, quoted, self.VIRALZONE, quoted)
            else:
                pathogenLinksHTML = ''

            if pathogenProteinCount:
                withStr = (' with %d protein%s' %
                           (pathogenProteinCount,
                            '' if pathogenProteinCount == 1 else 's'))
            else:
                withStr = ''

            pathogenIndex = self.pathogenSampleFiles.pathogenIndex(
                pathogenName)

            pathogenReadsFilename = join(
                self._pathogenDataDir,
                'pathogen-%d.%s' % (pathogenIndex, self._format))

            pathogenReadsFp = open(pathogenReadsFilename, 'w')
            pathogenReadCount = 0

            append(
                '<a id="pathogen-%s"></a>'
                '<p class="pathogen">'
                '<span class="pathogen-name">%s</span>'
                '%s %s, '
                'was matched by %d sample%s '
                '(<a href="%s">%s</a> in total):'
                '</p>' %
                (pathogenName,
                 pathogenName,
                 pathogenLinksHTML, withStr,
                 sampleCount, '' if sampleCount == 1 else 's',
                 pathogenReadsFilename, self.READCOUNT_MARKER))

            # Remember where we are in the output result so we can fill in
            # the total read count once we have processed all samples for
            # this pathogen. Not nice, I know.
            pathogenReadCountLineIndex = len(result) - 1

            for sampleName in sorted(samples):
                readsFileName = self.pathogenSampleFiles.lookup(
                    pathogenName, sampleName)

                # Copy the read data from the per-sample reads for this
                # pathogen into the per-pathogen file of reads.
                with open(readsFileName) as readsFp:
                    while True:
                        data = readsFp.read(4096)
                        if data:
                            pathogenReadsFp.write(data)
                        else:
                            break

                proteins = samples[sampleName]['proteins']
                proteinCount = len(proteins)
                uniqueReadCount = samples[sampleName]['uniqueReadCount']
                pathogenReadCount += uniqueReadCount

                if omitSampleProteinCount:
                    proteinCountHTML = ''
                else:
                    proteinCountHTML = '%d protein%s, ' % (
                        proteinCount, '' if proteinCount == 1 else 's')

                append(
                    '<p class="sample indented">'
                    'Sample <a href="#sample-%s">%s</a> '
                    '(%s<a href="%s">%d de-duplicated (by id) '
                    'read%s</a>, <a href="%s">panel</a>):</p>' %
                    (sampleName, sampleName,
                     proteinCountHTML,
                     readsFileName,
                     uniqueReadCount, '' if uniqueReadCount == 1 else 's',
                     self.sampleNames[sampleName]))
                append('<ul class="protein-list indented">')
                for proteinName in sorted(proteins):
                    proteinMatch = proteins[proteinName]
                    append(
                        '<li>'
                        '<span class="stats">'
                        '%(coverage).2f %(medianScore)6.2f %(bestScore)6.2f '
                        '%(readAndHspCountStr)11s %(proteinLength)4d '
                        % proteinMatch
                    )

                    if self._saveReadLengths:
                        append('(%s) ' % ', '.join(
                            map(str, sorted(proteinMatch['readLengths']))))

                    append(
                        '</span> '
                        '<span class="protein-name">'
                        '%(proteinName)s'
                        '</span> '
                        '(<a href="%(bluePlotFilename)s">blue plot</a>, '
                        '<a href="%(readsFilename)s">reads</a>'
                        % proteinMatch)

                    if proteinMatch['proteinURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI protein</a>' %
                                       proteinMatch['proteinURL'])

                    if proteinMatch['genomeURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI genome</a>' %
                                       proteinMatch['genomeURL'])
                    result[-1] += ')'

                    append('</li>')

                append('</ul>')

            pathogenReadsFp.close()

            # Sanity check there's a read count marker text in our output
            # where we expect it.
            readCountLine = result[pathogenReadCountLineIndex]
            if readCountLine.find(self.READCOUNT_MARKER) == -1:
                raise ValueError(
                    'Could not find pathogen read count marker (%s) in result '
                    'index %d text (%s).' %
                    (self.READCOUNT_MARKER, pathogenReadCountLineIndex,
                     readCountLine))

            # Put the read count into the pathogen summary line we wrote
            # earlier, replacing the read count marker with the correct
            # text.
            result[pathogenReadCountLineIndex] = readCountLine.replace(
                self.READCOUNT_MARKER,
                '%d read%s' % (pathogenReadCount,
                               '' if pathogenReadCount == 1 else 's'))

        # Write all samples (with pathogens (with proteins)).
        append('<hr>')
        append('<h1>Samples by pathogen</h1>')

        for sampleName in sampleNames:
            samplePathogenNames = [
                pathName for pathName in self.pathogenNames
                if sampleName in self.pathogenNames[pathName]]

            if len(samplePathogenNames):
                append(
                    '<a id="sample-%s"></a>'
                    '<p class="sample">Sample '
                    '<span class="sample-name">%s</span> '
                    'matched proteins from %d pathogen%s, '
                    '<a href="%s">panel</a>:</p>' %
                    (sampleName, sampleName, len(samplePathogenNames),
                     '' if len(samplePathogenNames) == 1 else 's',
                     self.sampleNames[sampleName]))
            else:
                append(
                    '<a id="sample-%s"></a>'
                    '<p class="sample">Sample '
                    '<span class="sample-name">%s</span> '
                    'did not match anything.</p>' %
                    (sampleName, sampleName))
                continue

            for pathogenName in sorted(samplePathogenNames):
                readsFileName = self.pathogenSampleFiles.lookup(pathogenName,
                                                                sampleName)
                proteins = self.pathogenNames[pathogenName][sampleName][
                    'proteins']
                uniqueReadCount = self.pathogenNames[
                    pathogenName][sampleName]['uniqueReadCount']
                proteinCount = len(proteins)
                pathogenProteinCount = self._pathogenProteinCount[pathogenName]

                if pathogenProteinCount:
                    proteinCountStr = '%d/%d protein%s' % (
                        proteinCount, pathogenProteinCount,
                        '' if pathogenProteinCount == 1 else 's')
                else:
                    proteinCountStr = '%d protein%s' % (
                        proteinCount, '' if proteinCount == 1 else 's')

                append(
                    '<p class="sample indented">'
                    '<a href="#pathogen-%s">%s</a> %s, '
                    '<a href="%s">%d de-duplicated (by id) read%s</a>:</p>' %
                    (pathogenName, pathogenName,
                     proteinCountStr, readsFileName,
                     uniqueReadCount, '' if uniqueReadCount == 1 else 's'))
                append('<ul class="protein-list indented">')
                for proteinName in sorted(proteins):
                    proteinMatch = proteins[proteinName]
                    append(
                        '<li>'
                        '<span class="stats">'
                        '%(coverage).2f %(medianScore)6.2f %(bestScore)6.2f '
                        '%(readAndHspCountStr)11s %(proteinLength)4d '
                        '</span> '
                        '<span class="protein-name">'
                        '%(proteinName)s'
                        '</span> '
                        '(<a href="%(bluePlotFilename)s">blue plot</a>, '
                        '<a href="%(readsFilename)s">reads</a>'
                        % proteinMatch)

                    if proteinMatch['proteinURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI protein</a>' %
                                       proteinMatch['proteinURL'])

                    if proteinMatch['genomeURL']:
                        # Append this directly to the last string in result, to
                        # avoid introducing whitespace when we join result
                        # using '\n'.
                        result[-1] += (', <a href="%s">NCBI genome</a>' %
                                       proteinMatch['genomeURL'])

                    result[-1] += ')'

                    append('</li>')

                append('</ul>')

        append('</body>')
        append('</html>')

        return '\n'.join(result)

    def _pathogenSamplePlot(self, pathogenName, sampleNames, ax):
        """
        Make an image of a graph giving pathogen read count (Y axis) versus
        sample id (X axis).

        @param pathogenName: A C{str} pathogen name.
        @param sampleNames: A sorted C{list} of sample names.
        @param ax: A matplotlib C{axes} instance.
        """
        readCounts = []
        for i, sampleName in enumerate(sampleNames):
            try:
                readCount = self.pathogenNames[pathogenName][sampleName][
                    'uniqueReadCount']
            except KeyError:
                readCount = 0
            readCounts.append(readCount)

        highlight = 'r'
        normal = 'gray'
        sdMultiple = 2.5
        minReadsForHighlighting = 10
        highlighted = []

        if len(readCounts) == 1:
            if readCounts[0] > minReadsForHighlighting:
                color = [highlight]
                highlighted.append(sampleNames[0])
            else:
                color = [normal]
        else:
            mean = np.mean(readCounts)
            sd = np.std(readCounts)
            color = []
            for readCount, sampleName in zip(readCounts, sampleNames):
                if (readCount > (sdMultiple * sd) + mean and
                        readCount >= minReadsForHighlighting):
                    color.append(highlight)
                    highlighted.append(sampleName)
                else:
                    color.append(normal)

        nSamples = len(sampleNames)
        x = np.arange(nSamples)
        yMin = np.zeros(nSamples)
        ax.set_xticks([])
        ax.set_xlim((-0.5, nSamples - 0.5))
        ax.vlines(x, yMin, readCounts, color=color)
        if highlighted:
            title = '%s\nIn red: %s' % (
                pathogenName, fill(', '.join(highlighted), 50))
        else:
            # Add a newline to keep the first line of each title at the
            # same place as those titles that have an "In red:" second
            # line.
            title = pathogenName + '\n'

        ax.set_title(title, fontsize=10)
        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.tick_params(axis='both', which='minor', labelsize=6)

    def pathogenPanel(self, filename):
        """
        Make a panel of images, with each image being a graph giving pathogen
        de-duplicated (by id) read count (Y axis) versus sample id (X axis).

        The panel is a grid five plots wide with as many rows as are needed
        to hold one plot per pathogen. The figure is closed once it has been
        written (or if writing fails) so that it does not stay registered
        with pyplot.

        @param filename: A C{str} file name to write the image to.
        """
        import matplotlib
        matplotlib.use('PDF')
        import matplotlib.pyplot as plt

        self._computeUniqueReadCounts()
        pathogenNames = sorted(self.pathogenNames)
        sampleNames = sorted(self.sampleNames)

        cols = 5
        # Ceiling division. Always make at least one row because a subplot
        # grid cannot be laid out with zero rows (which is what we would
        # otherwise ask for when there are no pathogens).
        rows = max(1, -(-len(pathogenNames) // cols))
        figure, ax = plt.subplots(rows, cols, squeeze=False)

        try:
            coords = dimensionalIterator((rows, cols))

            for pathogenName in pathogenNames:
                row, col = next(coords)
                self._pathogenSamplePlot(
                    pathogenName, sampleNames, ax[row][col])

            # Hide the final panel graphs (if any) that have no content. We
            # do this because the panel is a rectangular grid and some of
            # the plots at the end of the last row may be unused.
            for row, col in coords:
                ax[row][col].axis('off')

            figure.suptitle(
                ('Per-sample read count for %d pathogen%s and %d sample%s.'
                 '\n\n'
                 'Sample name%s: %s') % (
                     len(pathogenNames),
                     '' if len(pathogenNames) == 1 else 's',
                     len(sampleNames),
                     '' if len(sampleNames) == 1 else 's',
                     '' if len(sampleNames) == 1 else 's',
                     fill(', '.join(sampleNames), 50)),
                fontsize=20)
            figure.set_size_inches(5.0 * cols, 2.0 * rows, forward=True)
            plt.subplots_adjust(hspace=0.4)

            figure.savefig(filename)
        finally:
            # pyplot keeps a reference to every figure it creates until the
            # figure is explicitly closed.
            plt.close(figure)
Beispiel #35
0
    def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
                oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
                blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
                taxonomy, iteratorIndex, readIdRegex):
        """
        Filter the read alignments in self.

        Do not call this function directly, instead use self.filter (above).
        Argument defaults and descriptions (other than for iteratorIndex) are
        as in self.filter.

        @param iteratorIndex: An index into self._iterators. Calling the
            iterator function will return a generator that yields
            C{ReadAlignments} instances.
        @return: A generator that yields C{ReadAlignments} instances.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the above arguments for
        #    each match the way the current code does.
        #
        # 3. A better approach with readIdRegex might be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex
                or truncateTitlesAfter):
            titleFilter = TitleFilter(whitelist=whitelist,
                                      blacklist=blacklist,
                                      positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex,
                                      truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if taxonomy is not None:
            lineageFetcher = LineageFetcher()

        if readIdRegex is not None:
            readIdRegex = re.compile(readIdRegex)

        count = 0
        for readAlignments in self._iterators[iteratorIndex]():
            if limit is not None and count == limit:
                return

            # Filter on the read id.
            if (readIdRegex
                    and readIdRegex.search(readAlignments.read.id) is None):
                continue

            if titleFilter:
                # Remove alignments against sequences whose titles are
                # unacceptable.
                wantedAlignments = []
                for alignment in readAlignments:
                    if (titleFilter.accept(alignment.subjectTitle) !=
                            TitleFilter.REJECT):
                        wantedAlignments.append(alignment)
                if wantedAlignments:
                    readAlignments[:] = wantedAlignments
                else:
                    continue

            # Only return alignments that are against sequences of the
            # desired length.
            if minSequenceLen is not None or maxSequenceLen is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    length = alignment.subjectLength
                    if not ((minSequenceLen is not None
                             and length < minSequenceLen) or
                            (maxSequenceLen is not None
                             and length > maxSequenceLen)):
                        wantedAlignments.append(alignment)
                if wantedAlignments:
                    readAlignments[:] = wantedAlignments
                else:
                    continue

            if taxonomy is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    lineage = lineageFetcher.lineage(alignment.subjectTitle)
                    if lineage:
                        for taxonomyIdAndScientificName in lineage:
                            if taxonomy in taxonomyIdAndScientificName:
                                wantedAlignments.append(alignment)
                    else:
                        # No lineage info was found. Keep the alignment
                        # since we can't rule it out.  We could add another
                        # option to control this.
                        wantedAlignments.append(alignment)
                if wantedAlignments:
                    readAlignments[:] = wantedAlignments
                else:
                    continue

            if oneAlignmentPerRead and readAlignments:
                readAlignments[:] = [bestAlignment(readAlignments)]

            #
            # From here on we do only HSP-based filtering.
            #

            # Throw out any unwanted HSPs due to maxHspsPerHit.
            if maxHspsPerHit is not None:
                for alignment in readAlignments:
                    hsps = alignment.hsps
                    if len(hsps) > maxHspsPerHit:
                        alignment.hsps = hsps[:maxHspsPerHit]

            # Throw out HSPs whose scores are not good enough.
            if scoreCutoff is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    hsps = alignment.hsps
                    wantedHsps = []
                    for hsp in hsps:
                        if hsp.betterThan(scoreCutoff):
                            wantedHsps.append(hsp)
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if wantedAlignments:
                    readAlignments[:] = wantedAlignments
                else:
                    continue

            # Throw out HSPs that don't match in the desired place on the
            # matched sequence.
            if minStart is not None or maxStop is not None:
                wantedAlignments = []
                for alignment in readAlignments:
                    hsps = alignment.hsps
                    wantedHsps = []
                    for hsp in hsps:
                        if not ((minStart is not None
                                 and hsp.readStartInSubject < minStart) or
                                (maxStop is not None
                                 and hsp.readEndInSubject > maxStop)):
                            wantedHsps.append(hsp)
                    if wantedHsps:
                        alignment.hsps = wantedHsps
                        wantedAlignments.append(alignment)
                if wantedAlignments:
                    readAlignments[:] = wantedAlignments
                else:
                    continue

            yield readAlignments
            count += 1

        if taxonomy:
            lineageFetcher.close()
Beispiel #36
0
    def _filter(self, minLength=None, maxLength=None, removeGaps=False,
                whitelist=None, blacklist=None,
                titleRegex=None, negativeTitleRegex=None,
                truncateTitlesAfter=None, indices=None, head=None,
                removeDuplicates=False, modifier=None, randomSubset=None,
                trueLength=None, sampleFraction=None,
                sequenceNumbersFile=None):
        """
        Filter a set of reads to produce a matching subset.

        See docstring for self.filter (above) for parameter docs.

        @return: A generator that yields C{Read} instances.
        """

        def _wantedSequences(filename):
            """
            Read and yield integer sequence numbers from a file.

            @raise ValueError: If the sequence numbers are not all positive or
                are not ascending.
            @return: A generator that yields C{int} sequence numbers.
            """
            with open(filename) as fp:
                lastNumber = None
                for line in fp:
                    n = int(line)
                    if lastNumber is None:
                        if n < 1:
                            raise ValueError(
                                'First line of sequence number file %r must '
                                'be at least 1.' % filename)
                        lastNumber = n
                        yield n
                    else:
                        if n > lastNumber:
                            lastNumber = n
                            yield n
                        else:
                            raise ValueError(
                                'Line number file %r contains non-ascending '
                                'numbers %d and %d.' %
                                (filename, lastNumber, n))

        if randomSubset is not None and sampleFraction is not None:
            raise ValueError('randomSubset and sampleFraction cannot be '
                             'used simultaneously in a filter. Call filter '
                             'twice instead.')

        if sequenceNumbersFile is None:
            nextWantedSequenceNumber = None
            wantedSequenceNumberGeneratorExhausted = False
        else:
            wantedSequenceNumerGenerator = _wantedSequences(
                sequenceNumbersFile)
            try:
                nextWantedSequenceNumber = next(wantedSequenceNumerGenerator)
            except StopIteration:
                # There was a sequence number file, but it was empty.
                return
            else:
                wantedSequenceNumberGeneratorExhausted = False

        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if removeDuplicates:
            sequencesSeen = set()

        if sampleFraction is not None:
            if sampleFraction == 0.0:
                # The filter returns nothing.
                return
            elif sampleFraction == 1.0:
                # Passing 1.0 can be treated the same as passing no value.
                # This makes the loop code simpler.
                sampleFraction = None

        if randomSubset is not None and trueLength is None:
            trueLength = self._length

        yieldCount = 0

        for readIndex, read in enumerate(self):

            if wantedSequenceNumberGeneratorExhausted:
                return

            if nextWantedSequenceNumber is not None:
                if readIndex + 1 == nextWantedSequenceNumber:
                    # We want this sequence.
                    try:
                        nextWantedSequenceNumber = next(
                            wantedSequenceNumerGenerator)
                    except StopIteration:
                        # The sequence number iterator ran out of sequence
                        # numbers.  We must let the rest of the filtering
                        # continue for the current sequence in case we
                        # throw it out for other reasons (as we might have
                        # done for any of the earlier wanted sequence
                        # numbers).
                        wantedSequenceNumberGeneratorExhausted = True
                else:
                    # This sequence isn't wanted.
                    continue

            if (sampleFraction is not None and
                    uniform(0.0, 1.0) > sampleFraction):
                # Note that we don't have to worry about the 0.0 or 1.0
                # cases in the above if, as they have been dealt with
                # before the loop.
                continue

            if randomSubset is not None:
                if yieldCount == randomSubset:
                    # The random subset has already been fully returned.
                    # There's no point in going any further through the input.
                    return
                elif uniform(0.0, 1.0) > ((randomSubset - yieldCount) /
                                          (trueLength - readIndex)):
                    continue

            if head is not None and readIndex == head:
                # We're completely done.
                return

            readLen = len(read)
            if ((minLength is not None and readLen < minLength) or
                    (maxLength is not None and readLen > maxLength)):
                continue

            if removeGaps:
                sequence = read.sequence.replace('-', '')
                read = read.__class__(read.id, sequence, read.quality)

            if (titleFilter and
                    titleFilter.accept(read.id) == TitleFilter.REJECT):
                continue

            if indices is not None and readIndex not in indices:
                continue

            if removeDuplicates:
                if read.sequence in sequencesSeen:
                    continue
                sequencesSeen.add(read.sequence)

            if modifier:
                modified = modifier(read)
                if modified is None:
                    continue
                else:
                    read = modified

            yield read
            yieldCount += 1
Beispiel #37
0
    def filter(self, minLength=None, maxLength=None, removeGaps=False,
               whitelist=None, blacklist=None,
               titleRegex=None, negativeTitleRegex=None,
               truncateTitlesAfter=None, indices=None, head=None,
               removeDuplicates=False, modifier=None):
        """
        Filter a set of reads to produce a matching subset.

        Note: there are many additional filtering options that could be added,
        e.g., filtering on read id (whitelist, blacklist, regex, etc), GC %,
        and quality.

        @param minLength: The minimum acceptable length.
        @param maxLength: The maximum acceptable length.
        @param removeGaps: If C{True} remove all gaps ('-' characters) from the
            read sequences. If a read has a quality string, the quality
            values at the gap positions are removed too. Note that the length
            checks are made before gaps are removed.
        @param whitelist: If not C{None}, a set of exact read ids that are
            always acceptable (though other characteristics, such as length,
            of a whitelisted id may rule it out).
        @param blacklist: If not C{None}, a set of exact read ids that are
            never acceptable.
        @param titleRegex: A regex that read ids must match.
        @param negativeTitleRegex: A regex that read ids must not match.
        @param truncateTitlesAfter: A string that read ids will be truncated
            beyond. If the truncated version of an id has already been seen,
            that sequence will be skipped.
        @param indices: Either C{None} or a set of C{int} indices corresponding
            to reads that are wanted. Indexing starts at zero.
        @param head: If not C{None}, the C{int} number of sequences at the
            start of the reads to return. Later sequences are skipped.
        @param removeDuplicates: If C{True} remove duplicated sequences.
        @param modifier: If not C{None} a function that is passed a read
            and which either returns a read or C{None}. If it returns a read,
            that read is passed through the filter. If it returns C{None},
            the read is omitted. Such a function can be used to do customized
            filtering, to change sequence ids, etc.
        @return: A generator that yields C{Read} instances.
        """
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if removeDuplicates:
            sequencesSeen = set()

        for readIndex, read in enumerate(self):

            if head is not None and readIndex == head:
                # We're completely done.
                return

            readLen = len(read)
            if ((minLength is not None and readLen < minLength) or
                    (maxLength is not None and readLen > maxLength)):
                continue

            if removeGaps:
                gapped = read.sequence
                sequence = gapped.replace('-', '')
                quality = read.quality
                if quality is not None and len(sequence) != len(gapped):
                    # Drop the quality values at the gap positions too, so
                    # the quality stays in step with the sequence.
                    # NOTE(review): assumes quality is a str with one
                    # character per sequence position - confirm.
                    quality = ''.join(
                        q for base, q in zip(gapped, quality) if base != '-')
                read = read.__class__(read.id, sequence, quality)

            if (titleFilter and
                    titleFilter.accept(read.id) == TitleFilter.REJECT):
                continue

            if indices is not None and readIndex not in indices:
                continue

            if removeDuplicates:
                # Duplicates are detected on the sequence alone, after any
                # gap removal.
                if read.sequence in sequencesSeen:
                    continue
                sequencesSeen.add(read.sequence)

            if modifier:
                modified = modifier(read)
                if modified is None:
                    continue
                else:
                    read = modified

            yield read
Beispiel #38
0
    def _filter(self,
                minLength=None,
                maxLength=None,
                removeGaps=False,
                whitelist=None,
                blacklist=None,
                titleRegex=None,
                negativeTitleRegex=None,
                truncateTitlesAfter=None,
                indices=None,
                head=None,
                removeDuplicates=False,
                modifier=None,
                randomSubset=None,
                trueLength=None,
                sampleFraction=None,
                sequenceNumbersFile=None):
        """
        Filter a set of reads to produce a matching subset.

        See docstring for self.filter (above) for parameter docs.

        @return: A generator that yields C{Read} instances.
        """
        def _wantedSequences(filename):
            """
            Read and yield integer sequence numbers from a file.

            @raise ValueError: If the sequence numbers are not all positive or
                are not ascending.
            @return: A generator that yields C{int} sequence numbers.
            """
            with open(filename) as fp:
                lastNumber = None
                for line in fp:
                    n = int(line)
                    if lastNumber is None:
                        if n < 1:
                            raise ValueError(
                                'First line of sequence number file %r must '
                                'be at least 1.' % filename)
                        lastNumber = n
                        yield n
                    else:
                        if n > lastNumber:
                            lastNumber = n
                            yield n
                        else:
                            raise ValueError(
                                'Line number file %r contains non-ascending '
                                'numbers %d and %d.' %
                                (filename, lastNumber, n))

        if randomSubset is not None and sampleFraction is not None:
            raise ValueError('randomSubset and sampleFraction cannot be '
                             'used simultaneously in a filter. Call filter '
                             'twice instead.')

        if sequenceNumbersFile is None:
            nextWantedSequenceNumber = None
            wantedSequenceNumberGeneratorExhausted = False
        else:
            wantedSequenceNumerGenerator = _wantedSequences(
                sequenceNumbersFile)
            try:
                nextWantedSequenceNumber = next(wantedSequenceNumerGenerator)
            except StopIteration:
                # There was a sequence number file, but it was empty.
                return
            else:
                wantedSequenceNumberGeneratorExhausted = False

        if (whitelist or blacklist or titleRegex or negativeTitleRegex
                or truncateTitlesAfter):
            titleFilter = TitleFilter(whitelist=whitelist,
                                      blacklist=blacklist,
                                      positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex,
                                      truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if removeDuplicates:
            sequencesSeen = set()

        if sampleFraction is not None:
            if sampleFraction == 0.0:
                # The filter returns nothing.
                return
            elif sampleFraction == 1.0:
                # Passing 1.0 can be treated the same as passing no value.
                # This makes the loop code simpler.
                sampleFraction = None

        if randomSubset is not None and trueLength is None:
            trueLength = self._length

        yieldCount = 0

        for readIndex, read in enumerate(self):

            if wantedSequenceNumberGeneratorExhausted:
                return

            if nextWantedSequenceNumber is not None:
                if readIndex + 1 == nextWantedSequenceNumber:
                    # We want this sequence.
                    try:
                        nextWantedSequenceNumber = next(
                            wantedSequenceNumerGenerator)
                    except StopIteration:
                        # The sequence number iterator ran out of sequence
                        # numbers.  We must let the rest of the filtering
                        # continue for the current sequence in case we
                        # throw it out for other reasons (as we might have
                        # done for any of the earlier wanted sequence
                        # numbers).
                        wantedSequenceNumberGeneratorExhausted = True
                else:
                    # This sequence isn't wanted.
                    continue

            if (sampleFraction is not None
                    and uniform(0.0, 1.0) > sampleFraction):
                # Note that we don't have to worry about the 0.0 or 1.0
                # cases in the above if, as they have been dealt with
                # before the loop.
                continue

            if randomSubset is not None:
                if yieldCount == randomSubset:
                    # The random subset has already been fully returned.
                    # There's no point in going any further through the input.
                    return
                elif uniform(0.0, 1.0) > ((randomSubset - yieldCount) /
                                          (trueLength - readIndex)):
                    continue

            if head is not None and readIndex == head:
                # We're completely done.
                return

            readLen = len(read)
            if ((minLength is not None and readLen < minLength)
                    or (maxLength is not None and readLen > maxLength)):
                continue

            if removeGaps:
                sequence = read.sequence.replace('-', '')
                read = read.__class__(read.id, sequence, read.quality)

            if (titleFilter
                    and titleFilter.accept(read.id) == TitleFilter.REJECT):
                continue

            if indices is not None and readIndex not in indices:
                continue

            if removeDuplicates:
                if read.sequence in sequencesSeen:
                    continue
                sequencesSeen.add(read.sequence)

            if modifier:
                modified = modifier(read)
                if modified is None:
                    continue
                else:
                    read = modified

            yield read
            yieldCount += 1