コード例 #1
0
    def testGetTaxonomy(self):
        """
        C{LineageFetcher.lineage} must return the expected (taxonomy id,
        scientific name) pairs for a title when backed by a fake database.
        """
        expectedLineage = [
            (15, 'Merkel cell polyomavirus'),
            (4, 'Polyomavirus'),
            (3, 'dsDNA viruses'),
            (2, 'Vira'),
        ]

        # Canned query results: for each expected entry a row holding the id
        # followed by a row holding the name, and finally a lone [1] row.
        cannedRows = []
        for taxonomyId, scientificName in expectedLineage:
            cannedRows.append([taxonomyId])
            cannedRows.append([scientificName])
        cannedRows.append([1])

        connection = FakeDbConnection(cannedRows)
        fetcher = LineageFetcher(db=connection, cursor=connection.cursor())

        self.assertEqual(
            expectedLineage,
            fetcher.lineage('gi|5|gb|EU375804.1| Merkel cell polyomavirus'))
コード例 #2
0
ファイル: alignments.py プロジェクト: UdoGi/dark-matter
    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None,
                 minStart=None, maxStop=None,
                 oneAlignmentPerRead=False, maxHspsPerHit=None,
                 scoreCutoff=None, percentageIdenticalCutoff=None,
                 percentagePositiveCutoff=None,
                 whitelist=None, blacklist=None,
                 whitelistFile=None, blacklistFile=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):
        """
        Record the filtering settings and create the helpers they require.

        A C{TitleFilter} is built only if at least one title-related
        argument is truthy, a C{LineageFetcher} only if C{taxonomy} is not
        C{None}, and C{readIdRegex} (when given) is compiled.
        """
        # Settings that are stored exactly as passed.
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.scoreCutoff = scoreCutoff
        self.percentageIdenticalCutoff = percentageIdenticalCutoff
        self.percentagePositiveCutoff = percentagePositiveCutoff

        # Title filtering is only set up when some title option was given.
        titleOptions = dict(
            whitelist=whitelist,
            blacklist=blacklist,
            whitelistFile=whitelistFile,
            blacklistFile=blacklistFile,
            positiveRegex=titleRegex,
            negativeRegex=negativeTitleRegex,
            truncateAfter=truncateTitlesAfter)
        if any(titleOptions.values()):
            self.titleFilter = TitleFilter(**titleOptions)
        else:
            self.titleFilter = None

        # A lineage fetcher is only needed for taxonomy-based filtering.
        self.lineageFetcher = None if taxonomy is None else LineageFetcher()
        self.taxonomy = taxonomy

        self.readIdRegex = (
            None if readIdRegex is None else re.compile(readIdRegex))

        self.count = 0
コード例 #3
0
ファイル: test_taxonomy.py プロジェクト: acorg/dark-matter
    def testGetTaxonomy(self):
        """
        Check the lineage that C{LineageFetcher} reports for a title when it
        is backed by a fake database connection returning canned rows.
        """
        fakeDb = FakeDbConnection([
            [15],
            ['Merkel cell polyomavirus'],
            [4],
            ['Polyomavirus'],
            [3],
            ['dsDNA viruses'],
            [2],
            ['Vira'],
            [1],
        ])
        fetcher = LineageFetcher(db=fakeDb, cursor=fakeDb.cursor())

        expected = [
            (15, 'Merkel cell polyomavirus'),
            (4, 'Polyomavirus'),
            (3, 'dsDNA viruses'),
            (2, 'Vira'),
        ]
        result = fetcher.lineage(
            'gi|5|gb|EU375804.1| Merkel cell polyomavirus')

        self.assertEqual(expected, result)
コード例 #4
0
ファイル: alignments.py プロジェクト: acorg/dark-matter
    def __init__(self,
                 limit=None,
                 maxAlignmentsPerRead=None,
                 minSequenceLen=None,
                 maxSequenceLen=None,
                 minStart=None,
                 maxStop=None,
                 oneAlignmentPerRead=False,
                 maxHspsPerHit=None,
                 scoreCutoff=None,
                 whitelist=None,
                 blacklist=None,
                 whitelistFile=None,
                 blacklistFile=None,
                 titleRegex=None,
                 negativeTitleRegex=None,
                 truncateTitlesAfter=None,
                 taxonomy=None,
                 readIdRegex=None):
        """
        Store the filter configuration and set up the optional helpers.

        The title filter exists only when a title-related argument is truthy,
        the lineage fetcher only when C{taxonomy} is not C{None}, and the
        read id regex is compiled when one is supplied.
        """
        # Simple settings, kept as given.
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen, self.maxSequenceLen = (
            minSequenceLen, maxSequenceLen)
        self.minStart, self.maxStop = minStart, maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # Only build a title filter if we were asked to filter on matched
        # sequence titles in some way.
        wantTitleFilter = any((
            whitelist, blacklist, whitelistFile, blacklistFile,
            titleRegex, negativeTitleRegex, truncateTitlesAfter))
        self.titleFilter = None
        if wantTitleFilter:
            self.titleFilter = TitleFilter(
                whitelist=whitelist,
                blacklist=blacklist,
                whitelistFile=whitelistFile,
                blacklistFile=blacklistFile,
                positiveRegex=titleRegex,
                negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)

        # Taxonomy filtering needs a lineage fetcher.
        self.lineageFetcher = None
        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        self.taxonomy = taxonomy

        self.readIdRegex = None
        if readIdRegex is not None:
            self.readIdRegex = re.compile(readIdRegex)

        self.count = 0
コード例 #5
0
        print('Could not compile %r to a regular expression:' % args.taxonomy,
              e, file=sys.stderr)
        sys.exit(1)

    if args.detailsFile is not None:
        detailsFp = open(args.detailsFile, 'w')

        def details(accept, readId, taxonomy):
            return writeDetails(accept, readId, taxonomy, detailsFp)
    else:
        detailsFp = None

        def details(accept, readId, taxonomy):
            return None

    lineageFetcher = LineageFetcher()
    reads = FastaReads(sys.stdin)
    save = sys.stdout.write
    readCount = saveCount = noTaxonomyCount = 0

    for read in reads:
        readCount += 1
        fasta = read.toString('fasta')
        taxonomy = lineageFetcher.lineage(read.id)
        if taxonomy:
            for taxonomyId, scientificName in taxonomy:
                if regexp.match(scientificName):
                    details(True, read.id, taxonomy)
                    if not args.invert:
                        saveCount += 1
                        save(fasta)
コード例 #6
0
ファイル: alignments.py プロジェクト: bamueh/dark-matter
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of records to read.
    @param maxAlignmentsPerRead: An C{int} limit on the number of alignments a
        read may have in order not to be filtered out. Reads with a greater
        number of alignments will be elided. Pass 0 to filter out reads that
        did not match (i.e., align to) any subjects. Use C{None} for no
        max alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched sequence
        should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best
        alignment for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are always
        acceptable (though the match info for a whitelist title may rule it
        out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title will
        be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. eg 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    """
    def __init__(self, limit=None, maxAlignmentsPerRead=None,
                 minSequenceLen=None, maxSequenceLen=None,
                 minStart=None, maxStop=None,
                 oneAlignmentPerRead=False, maxHspsPerHit=None,
                 scoreCutoff=None, whitelist=None, blacklist=None,
                 titleRegex=None, negativeTitleRegex=None,
                 truncateTitlesAfter=None, taxonomy=None, readIdRegex=None):

        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            self.titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        # A lineage fetcher is only created when taxonomy filtering was
        # requested; close() releases it.
        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        # The number of reads that have passed the filter so far (compared
        # against self.limit).
        self.count = 0

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        Note that the passed C{readAlignments} is modified in place (its
        alignments, and the HSPs of those alignments, may be reduced), even
        if C{False} ends up being returned by a later filtering step.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None and
                len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex and
                self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            accept = self.titleFilter.accept
            wantedAlignments = [
                alignment for alignment in readAlignments
                if accept(alignment.subjectTitle) != TitleFilter.REJECT]
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = [
                alignment for alignment in readAlignments
                if not ((minSequenceLen is not None and
                         alignment.subjectLength < minSequenceLen) or
                        (maxSequenceLen is not None and
                         alignment.subjectLength > maxSequenceLen))]
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        taxonomy = self.taxonomy
        if taxonomy is not None:
            fetchLineage = self.lineageFetcher.lineage
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = fetchLineage(alignment.subjectTitle)
                # If no lineage info was found, keep the alignment since we
                # can't rule it out (we could add another option to control
                # this). Otherwise keep it if any (id, name) entry of its
                # lineage contains the wanted taxonomy. Using any() makes
                # sure an alignment is kept at most once, even if several
                # lineage entries match.
                if not lineage or any(taxonomy in taxonomyIdAndScientificName
                                      for taxonomyIdAndScientificName
                                      in lineage):
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        maxHspsPerHit = self.maxHspsPerHit
        if maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > maxHspsPerHit:
                    alignment.hsps = hsps[:maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough. An alignment
        # that is left with no HSPs is dropped (its hsps attribute is then
        # left untouched).
        scoreCutoff = self.scoreCutoff
        if scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                wantedHsps = [hsp for hsp in alignment.hsps
                              if hsp.betterThan(scoreCutoff)]
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                wantedHsps = [
                    hsp for hsp in alignment.hsps
                    if not ((minStart is not None and
                             hsp.readStartInSubject < minStart) or
                            (maxStop is not None and
                             hsp.readEndInSubject > maxStop))]
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if not wantedAlignments:
                return False
            readAlignments[:] = wantedAlignments

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.
        """
        # Test the fetcher itself rather than the truthiness of
        # self.taxonomy: __init__ creates a fetcher whenever taxonomy is not
        # None, so a falsy taxonomy (e.g., 0 or '') must still be closed.
        if self.lineageFetcher is not None:
            self.lineageFetcher.close()
コード例 #7
0
    def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
                oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
                blacklist, titleRegex, negativeTitleRegex,
                truncateTitlesAfter, taxonomy, iteratorIndex, readIdRegex):
        """
        Filter the read alignments in self.

        Do not call this function directly, instead use self.filter (above).
        Argument defaults and descriptions (other than for iteratorIndex) are
        as in self.filter.

        If C{taxonomy} is not C{None} a C{LineageFetcher} is created. It is
        closed when the generator finishes for any reason: exhaustion of the
        underlying iterator, reaching C{limit}, an exception, or the
        generator being closed / garbage collected before it is exhausted.

        @param iteratorIndex: An index into self._iterators. Calling the
            iterator function will return a generator that yields
            C{ReadAlignments} instances.
        @return: A generator that yields C{ReadAlignments} instances.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the above arguments for
        #    each match the way the current code does.
        #
        # 3. A better approach with readIdRegex might be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex or
                truncateTitlesAfter):
            titleFilter = TitleFilter(
                whitelist=whitelist, blacklist=blacklist,
                positiveRegex=titleRegex, negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if readIdRegex is not None:
            readIdRegex = re.compile(readIdRegex)

        # Create the lineage fetcher last, immediately before the try block
        # that guarantees it gets closed, so that a failure in the setup
        # above cannot leave it open.
        lineageFetcher = LineageFetcher() if taxonomy is not None else None

        count = 0
        try:
            for readAlignments in self._iterators[iteratorIndex]():
                if limit is not None and count == limit:
                    return

                # Filter on the read id.
                if (readIdRegex and
                        readIdRegex.search(readAlignments.read.id) is None):
                    continue

                if titleFilter:
                    # Remove alignments against sequences whose titles are
                    # unacceptable.
                    wantedAlignments = [
                        alignment for alignment in readAlignments
                        if (titleFilter.accept(alignment.subjectTitle) !=
                            TitleFilter.REJECT)]
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                # Only return alignments that are against sequences of the
                # desired length.
                if minSequenceLen is not None or maxSequenceLen is not None:
                    wantedAlignments = [
                        alignment for alignment in readAlignments
                        if not ((minSequenceLen is not None and
                                 alignment.subjectLength < minSequenceLen) or
                                (maxSequenceLen is not None and
                                 alignment.subjectLength > maxSequenceLen))]
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                if taxonomy is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        lineage = lineageFetcher.lineage(
                            alignment.subjectTitle)
                        # If no lineage info was found, keep the alignment
                        # since we can't rule it out (we could add another
                        # option to control this). Otherwise keep it if any
                        # (id, name) lineage entry contains the wanted
                        # taxonomy. any() ensures an alignment is kept at
                        # most once even if several entries match.
                        if not lineage or any(
                                taxonomy in taxonomyIdAndScientificName
                                for taxonomyIdAndScientificName in lineage):
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                if oneAlignmentPerRead and readAlignments:
                    readAlignments[:] = [bestAlignment(readAlignments)]

                #
                # From here on we do only HSP-based filtering.
                #

                # Throw out any unwanted HSPs due to maxHspsPerHit.
                if maxHspsPerHit is not None:
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        if len(hsps) > maxHspsPerHit:
                            alignment.hsps = hsps[:maxHspsPerHit]

                # Throw out HSPs whose scores are not good enough.
                if scoreCutoff is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = [hsp for hsp in alignment.hsps
                                      if hsp.betterThan(scoreCutoff)]
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                # Throw out HSPs that don't match in the desired place on the
                # matched sequence.
                if minStart is not None or maxStop is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = [
                            hsp for hsp in alignment.hsps
                            if not ((minStart is not None and
                                     hsp.readStartInSubject < minStart) or
                                    (maxStop is not None and
                                     hsp.readEndInSubject > maxStop))]
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if not wantedAlignments:
                        continue
                    readAlignments[:] = wantedAlignments

                yield readAlignments
                count += 1
        finally:
            # Runs on normal exhaustion, on the early return when the limit
            # is reached, on errors, and when the generator is closed early.
            if lineageFetcher is not None:
                lineageFetcher.close()
コード例 #8
0
class ReadsAlignmentsFilter(object):
    """
    Provide a filter for C{ReadsAlignments} instances.

    @param limit: An C{int} limit on the number of records to read.
    @param maxAlignmentsPerRead: An C{int} limit on the number of alignments a
        read may have in order not to be filtered out. Reads with a greater
        number of alignments will be elided. Pass 0 to filter out reads that
        did not match (i.e., align to) any subjects. Use C{None} for no
        max alignments filtering.
    @param minSequenceLen: Sequences of lesser length will be elided.
    @param maxSequenceLen: Sequences of greater length will be elided.
    @param minStart: HSPs that start before this offset in the matched
        sequence should not be returned.
    @param maxStop: HSPs that end after this offset in the matched sequence
        should not be returned.
    @param oneAlignmentPerRead: If C{True}, only keep the best
        alignment for each read.
    @param maxHspsPerHit: The maximum number of HSPs to keep for each
        alignment for each read.
    @param scoreCutoff: A C{float} score. Matches with scores that are not
        better than this score will be ignored.
    @param whitelist: If not C{None}, a set of exact titles that are always
        acceptable (though the match info for a whitelist title may rule it
        out for other reasons).
    @param blacklist: If not C{None}, a set of exact titles that are never
        acceptable.
    @param whitelistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are always acceptable.
    @param blacklistFile: If not C{None}, a C{str} filename containing lines
        that give exact ids that are never acceptable.
    @param titleRegex: A regex that sequence titles must match.
    @param negativeTitleRegex: A regex that sequence titles must not match.
    @param truncateTitlesAfter: A string that titles will be truncated
        beyond. If a truncated title has already been seen, that title will
        be elided.
    @param taxonomy: Either a C{str} name or an C{int} id of the taxonomic
        group on which should be filtered. eg 'Vira' will filter on
        viruses, while 11118 will filter on Coronaviridae.
    @param readIdRegex: A case-sensitive regex C{str} that read ids must
        match.
    @return: C{self}.
    """
    def __init__(self,
                 limit=None,
                 maxAlignmentsPerRead=None,
                 minSequenceLen=None,
                 maxSequenceLen=None,
                 minStart=None,
                 maxStop=None,
                 oneAlignmentPerRead=False,
                 maxHspsPerHit=None,
                 scoreCutoff=None,
                 whitelist=None,
                 blacklist=None,
                 whitelistFile=None,
                 blacklistFile=None,
                 titleRegex=None,
                 negativeTitleRegex=None,
                 truncateTitlesAfter=None,
                 taxonomy=None,
                 readIdRegex=None):

        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or whitelistFile or blacklistFile
                or titleRegex or negativeTitleRegex or truncateTitlesAfter):
            self.titleFilter = TitleFilter(whitelist=whitelist,
                                           blacklist=blacklist,
                                           whitelistFile=whitelistFile,
                                           blacklistFile=blacklistFile,
                                           positiveRegex=titleRegex,
                                           negativeRegex=negativeTitleRegex,
                                           truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        if taxonomy is not None:
            self.lineageFetcher = LineageFetcher()
        else:
            self.lineageFetcher = None
        self.taxonomy = taxonomy

        if readIdRegex is None:
            self.readIdRegex = None
        else:
            self.readIdRegex = re.compile(readIdRegex)

        self.count = 0

    def filter(self, readAlignments):
        """
        Filter a read's alignments.

        @param readAlignments: A C{ReadAlignments} instance.
        @return: A C{ReadAlignments} instance if the passed
            C{readAlignments} is not filtered out, else C{False}.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the desired filtering
        #    settings on each call the way the current code does.
        #
        # 3. A better approach with readIdRegex would be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #
        if self.limit is not None and self.count == self.limit:
            return False

        # Does the read have too many alignments?
        if (self.maxAlignmentsPerRead is not None
                and len(readAlignments) > self.maxAlignmentsPerRead):
            return False

        # Filter on the read id.
        if (self.readIdRegex
                and self.readIdRegex.search(readAlignments.read.id) is None):
            return False

        if self.titleFilter:
            # Remove alignments against sequences whose titles are
            # unacceptable.
            wantedAlignments = []
            for alignment in readAlignments:
                if (self.titleFilter.accept(alignment.subjectTitle) !=
                        TitleFilter.REJECT):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        # Only return alignments that are against sequences of the
        # desired length.
        minSequenceLen = self.minSequenceLen
        maxSequenceLen = self.maxSequenceLen
        if minSequenceLen is not None or maxSequenceLen is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                length = alignment.subjectLength
                if not (
                    (minSequenceLen is not None and length < minSequenceLen) or
                    (maxSequenceLen is not None
                     and length > self.maxSequenceLen)):
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        if self.taxonomy is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                lineage = self.lineageFetcher.lineage(alignment.subjectTitle)
                if lineage:
                    for taxonomyIdAndScientificName in lineage:
                        if self.taxonomy in taxonomyIdAndScientificName:
                            wantedAlignments.append(alignment)
                else:
                    # No lineage info was found. Keep the alignment
                    # since we can't rule it out.  We could add another
                    # option to control this.
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        if self.oneAlignmentPerRead and readAlignments:
            readAlignments[:] = [bestAlignment(readAlignments)]

        #
        # From here on we do only HSP-based filtering.
        #

        # Throw out any unwanted HSPs due to maxHspsPerHit.
        if self.maxHspsPerHit is not None:
            for alignment in readAlignments:
                hsps = alignment.hsps
                if len(hsps) > self.maxHspsPerHit:
                    alignment.hsps = hsps[:self.maxHspsPerHit]

        # Throw out HSPs whose scores are not good enough.
        if self.scoreCutoff is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if hsp.betterThan(self.scoreCutoff):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        # Throw out HSPs that don't match in the desired place on the
        # matched sequence.
        minStart = self.minStart
        maxStop = self.maxStop
        if minStart is not None or maxStop is not None:
            wantedAlignments = []
            for alignment in readAlignments:
                hsps = alignment.hsps
                wantedHsps = []
                for hsp in hsps:
                    if not ((minStart is not None
                             and hsp.readStartInSubject < minStart) or
                            (maxStop is not None
                             and hsp.readEndInSubject > maxStop)):
                        wantedHsps.append(hsp)
                if wantedHsps:
                    alignment.hsps = wantedHsps
                    wantedAlignments.append(alignment)
            if wantedAlignments:
                readAlignments[:] = wantedAlignments
            else:
                return False

        self.count += 1
        return readAlignments

    def close(self):
        """
        Close our lineage fetcher, if any.

        The filtering code uses C{self.lineageFetcher} whenever
        C{self.taxonomy} is not C{None}, so the same test is applied here.
        A plain truthiness test would skip closing the fetcher for a falsy
        (but non-C{None}) taxonomy value.
        """
        if self.taxonomy is not None:
            self.lineageFetcher.close()
コード例 #9
0
    def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
                oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
                blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
                taxonomy, iteratorIndex, readIdRegex):
        """
        Filter the read alignments in self.

        Do not call this function directly, instead use self.filter (above).
        Argument defaults and descriptions (other than for iteratorIndex) are
        as in self.filter.

        Each C{ReadAlignments} instance is filtered in place: its list of
        alignments is overwritten with the surviving alignments, and the
        C{hsps} attribute of each surviving alignment is overwritten with
        its surviving HSPs. A read that ends up with no alignments is not
        yielded.

        If a taxonomy is given, a lineage fetcher is created and is closed
        when the generator finishes for any reason (input exhausted, limit
        reached, generator closed early by the caller, or an exception).

        @param iteratorIndex: An index into self._iterators. Calling the
            iterator function will return a generator that yields
            C{ReadAlignments} instances.
        @return: A generator that yields C{ReadAlignments} instances.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the above arguments for
        #    each match the way the current code does.
        #
        # 3. A better approach with readIdRegex might be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex
                or truncateTitlesAfter):
            titleFilter = TitleFilter(whitelist=whitelist,
                                      blacklist=blacklist,
                                      positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex,
                                      truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if readIdRegex is not None:
            readIdRegex = re.compile(readIdRegex)

        # Create the fetcher last, immediately before the try block, so
        # that nothing can fail between acquiring it and guarding it.
        lineageFetcher = LineageFetcher() if taxonomy is not None else None

        try:
            count = 0
            for readAlignments in self._iterators[iteratorIndex]():
                if limit is not None and count == limit:
                    # The finally clause below closes the lineage fetcher.
                    return

                #
                # Alignment-only (i.e., non-HSP based) filtering.
                #

                # Filter on the read id.
                if (readIdRegex
                        and readIdRegex.search(readAlignments.read.id)
                        is None):
                    continue

                if titleFilter:
                    # Remove alignments against sequences whose titles are
                    # unacceptable.
                    wantedAlignments = [
                        alignment for alignment in readAlignments
                        if (titleFilter.accept(alignment.subjectTitle) !=
                            TitleFilter.REJECT)]
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                # Only return alignments that are against sequences of the
                # desired length.
                if minSequenceLen is not None or maxSequenceLen is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        length = alignment.subjectLength
                        if not ((minSequenceLen is not None
                                 and length < minSequenceLen) or
                                (maxSequenceLen is not None
                                 and length > maxSequenceLen)):
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                if taxonomy is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        lineage = lineageFetcher.lineage(
                            alignment.subjectTitle)
                        # If no lineage info was found, keep the alignment
                        # since we can't rule it out (we could add another
                        # option to control this). Otherwise keep it if
                        # the taxonomy matches at any level. Using any()
                        # keeps the alignment exactly once even when more
                        # than one lineage level matches.
                        if not lineage or any(
                                taxonomy in taxonomyIdAndScientificName
                                for taxonomyIdAndScientificName in lineage):
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                if oneAlignmentPerRead and readAlignments:
                    readAlignments[:] = [bestAlignment(readAlignments)]

                #
                # From here on we do only HSP-based filtering.
                #

                # Throw out any unwanted HSPs due to maxHspsPerHit.
                if maxHspsPerHit is not None:
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        if len(hsps) > maxHspsPerHit:
                            alignment.hsps = hsps[:maxHspsPerHit]

                # Throw out HSPs whose scores are not good enough.
                if scoreCutoff is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = [hsp for hsp in alignment.hsps
                                      if hsp.betterThan(scoreCutoff)]
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                # Throw out HSPs that don't match in the desired place on the
                # matched sequence.
                if minStart is not None or maxStop is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = []
                        for hsp in alignment.hsps:
                            if not ((minStart is not None
                                     and hsp.readStartInSubject < minStart) or
                                    (maxStop is not None
                                     and hsp.readEndInSubject > maxStop)):
                                wantedHsps.append(hsp)
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                yield readAlignments
                count += 1
        finally:
            # Runs on normal exhaustion, on the early return when the limit
            # is reached, when the caller closes/abandons the generator,
            # and on errors - so the fetcher is never leaked.
            if lineageFetcher is not None:
                lineageFetcher.close()
コード例 #10
0
              e,
              file=sys.stderr)
        sys.exit(1)

    if args.detailsFile is not None:
        detailsFp = open(args.detailsFile, 'w')

        def details(accept, readId, taxonomy):
            return writeDetails(accept, readId, taxonomy, detailsFp)
    else:
        detailsFp = None

        def details(accept, readId, taxonomy):
            return None

    lineageFetcher = LineageFetcher()
    reads = FastaReads(sys.stdin)
    save = sys.stdout.write
    readCount = saveCount = noTaxonomyCount = 0

    for read in reads:
        readCount += 1
        fasta = read.toString('fasta')
        taxonomy = lineageFetcher.lineage(read.id)
        if taxonomy:
            for taxonomyId, scientificName in taxonomy:
                if regexp.match(scientificName):
                    details(True, read.id, taxonomy)
                    if not args.invert:
                        saveCount += 1
                        save(fasta)