コード例 #1
0
    def testGetTaxonomy(self):
        """
        The lineage method of LineageFetcher must return the expected list
        of (taxonomy id, scientific name) pairs for a sequence title when
        the fetcher is backed by a fake database connection.
        """
        # The canned rows handed to the fake connection. They alternate
        # between a single-element id row and a single-element name row,
        # ending with a lone [1].
        rows = [
            [15],
            ['Merkel cell polyomavirus'],
            [4],
            ['Polyomavirus'],
            [3],
            ['dsDNA viruses'],
            [2],
            ['Vira'],
            [1],
        ]
        connection = FakeDbConnection(rows)
        fetcher = LineageFetcher(db=connection, cursor=connection.cursor())

        expected = [
            (15, 'Merkel cell polyomavirus'),
            (4, 'Polyomavirus'),
            (3, 'dsDNA viruses'),
            (2, 'Vira'),
        ]
        result = fetcher.lineage(
            'gi|5|gb|EU375804.1| Merkel cell polyomavirus')
        self.assertEqual(expected, result)
コード例 #2
0
ファイル: alignments.py プロジェクト: UdoGi/dark-matter
    def __init__(self,
                 limit=None,
                 maxAlignmentsPerRead=None,
                 minSequenceLen=None,
                 maxSequenceLen=None,
                 minStart=None,
                 maxStop=None,
                 oneAlignmentPerRead=False,
                 maxHspsPerHit=None,
                 scoreCutoff=None,
                 percentageIdenticalCutoff=None,
                 percentagePositiveCutoff=None,
                 whitelist=None,
                 blacklist=None,
                 whitelistFile=None,
                 blacklistFile=None,
                 titleRegex=None,
                 negativeTitleRegex=None,
                 truncateTitlesAfter=None,
                 taxonomy=None,
                 readIdRegex=None):
        """
        Record the filtering criteria and build the helper objects they need.

        C{limit}, C{maxAlignmentsPerRead}, C{minSequenceLen},
        C{maxSequenceLen}, C{minStart}, C{maxStop}, C{oneAlignmentPerRead},
        C{maxHspsPerHit}, C{scoreCutoff}, C{percentageIdenticalCutoff} and
        C{percentagePositiveCutoff} are stored on the instance unchanged.

        @param whitelist: Passed to C{TitleFilter} as C{whitelist}.
        @param blacklist: Passed to C{TitleFilter} as C{blacklist}.
        @param whitelistFile: Passed to C{TitleFilter} as C{whitelistFile}.
        @param blacklistFile: Passed to C{TitleFilter} as C{blacklistFile}.
        @param titleRegex: Passed to C{TitleFilter} as C{positiveRegex}.
        @param negativeTitleRegex: Passed to C{TitleFilter} as
            C{negativeRegex}.
        @param truncateTitlesAfter: Passed to C{TitleFilter} as
            C{truncateAfter}.
        @param taxonomy: Stored as C{self.taxonomy}. When it is not C{None}
            a C{LineageFetcher} is created and stored as
            C{self.lineageFetcher}, otherwise that attribute is C{None}.
        @param readIdRegex: If not C{None}, a pattern that is compiled with
            C{re.compile} and stored as C{self.readIdRegex}.
        """
        # Criteria that are simply remembered as given.
        self.limit = limit
        self.maxAlignmentsPerRead = maxAlignmentsPerRead
        self.minSequenceLen = minSequenceLen
        self.maxSequenceLen = maxSequenceLen
        self.minStart = minStart
        self.maxStop = maxStop
        self.oneAlignmentPerRead = oneAlignmentPerRead
        self.maxHspsPerHit = maxHspsPerHit
        self.scoreCutoff = scoreCutoff
        self.percentageIdenticalCutoff = percentageIdenticalCutoff
        self.percentagePositiveCutoff = percentagePositiveCutoff

        # A title filter is only built when at least one of the title-related
        # options is truthy; otherwise self.titleFilter is None.
        titleOptions = (whitelist, blacklist, whitelistFile, blacklistFile,
                        titleRegex, negativeTitleRegex, truncateTitlesAfter)
        if any(titleOptions):
            self.titleFilter = TitleFilter(
                whitelist=whitelist,
                blacklist=blacklist,
                whitelistFile=whitelistFile,
                blacklistFile=blacklistFile,
                positiveRegex=titleRegex,
                negativeRegex=negativeTitleRegex,
                truncateAfter=truncateTitlesAfter)
        else:
            self.titleFilter = None

        # A lineage fetcher is only created when a taxonomy was given.
        self.lineageFetcher = None if taxonomy is None else LineageFetcher()
        self.taxonomy = taxonomy

        # Compile the read id pattern once, up front.
        self.readIdRegex = (
            None if readIdRegex is None else re.compile(readIdRegex))

        # Starts at zero; nothing in this method changes it afterwards.
        self.count = 0
コード例 #3
0
    def _filter(self, limit, minSequenceLen, maxSequenceLen, minStart, maxStop,
                oneAlignmentPerRead, maxHspsPerHit, scoreCutoff, whitelist,
                blacklist, titleRegex, negativeTitleRegex, truncateTitlesAfter,
                taxonomy, iteratorIndex, readIdRegex):
        """
        Filter the read alignments in self.

        Do not call this function directly, instead use self.filter.
        Argument defaults and descriptions (other than for iteratorIndex) are
        as in self.filter.

        Filtering is done in place: the alignment list of each
        C{ReadAlignments} (and the C{hsps} list of each of its alignments)
        is replaced by the surviving subset. A C{ReadAlignments} that is
        left with no alignments by any filter is not yielded.

        If C{taxonomy} is not C{None} a C{LineageFetcher} is created, and it
        is closed when the generator finishes for any reason: the underlying
        iterator is exhausted, C{limit} is reached, an exception is raised,
        or the caller closes / discards the generator.

        @param iteratorIndex: An index into self._iterators. Calling the
            iterator function will return a generator that yields
            C{ReadAlignments} instances.
        @return: A generator that yields C{ReadAlignments} instances.
        """

        # Implementation notes:
        #
        # 1. The order in which we carry out the filtering actions can make
        #    a big difference in the result of this function. The current
        #    ordering is based on what seems reasonable - it may not be the
        #    best way to do things. E.g., if maxHspsPerHit is 1 and there
        #    is a title regex, which should we perform first?
        #
        #    We perform filtering based on alignment before that based on
        #    HSPs. That's because there's no point filtering all HSPs for
        #    an alignment that we end up throwing away anyhow.
        #
        # 2. This function could be made faster if it first looked at its
        #    arguments and dynamically created an acceptance function
        #    (taking a readAlignments as an argument). The acceptance
        #    function would run without examining the above arguments for
        #    each match the way the current code does.
        #
        # 3. A better approach with readIdRegex might be to allow the
        #    passing of a regex object. Then the caller would make the
        #    regex with whatever flags they liked (e.g., case insensitive).

        #
        # Alignment-only (i.e., non-HSP based) filtering.
        #

        # If we've been asked to filter on matched sequence titles in any way,
        # build a title filter.
        if (whitelist or blacklist or titleRegex or negativeTitleRegex
                or truncateTitlesAfter):
            titleFilter = TitleFilter(whitelist=whitelist,
                                      blacklist=blacklist,
                                      positiveRegex=titleRegex,
                                      negativeRegex=negativeTitleRegex,
                                      truncateAfter=truncateTitlesAfter)
        else:
            titleFilter = None

        if readIdRegex is not None:
            readIdRegex = re.compile(readIdRegex)

        # Create the lineage fetcher last, immediately before the try block
        # that guarantees it gets closed, so that a failure in the set-up
        # above cannot leave it open.
        lineageFetcher = LineageFetcher() if taxonomy is not None else None

        try:
            count = 0
            for readAlignments in self._iterators[iteratorIndex]():
                if limit is not None and count == limit:
                    return

                # Filter on the read id.
                if (readIdRegex and
                        readIdRegex.search(readAlignments.read.id) is None):
                    continue

                if titleFilter:
                    # Remove alignments against sequences whose titles are
                    # unacceptable.
                    wantedAlignments = [
                        alignment for alignment in readAlignments
                        if (titleFilter.accept(alignment.subjectTitle) !=
                            TitleFilter.REJECT)]
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                # Only return alignments that are against sequences of the
                # desired length.
                if minSequenceLen is not None or maxSequenceLen is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        length = alignment.subjectLength
                        if not ((minSequenceLen is not None
                                 and length < minSequenceLen) or
                                (maxSequenceLen is not None
                                 and length > maxSequenceLen)):
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                if taxonomy is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        lineage = lineageFetcher.lineage(
                            alignment.subjectTitle)
                        if lineage:
                            # Keep the alignment (once only) if the wanted
                            # taxonomy appears in any entry of the lineage.
                            # Using any() stops at the first match, so an
                            # alignment cannot be added more than once when
                            # several lineage entries match.
                            if any(taxonomy in taxonomyIdAndScientificName
                                   for taxonomyIdAndScientificName
                                   in lineage):
                                wantedAlignments.append(alignment)
                        else:
                            # No lineage info was found. Keep the alignment
                            # since we can't rule it out.  We could add
                            # another option to control this.
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                if oneAlignmentPerRead and readAlignments:
                    readAlignments[:] = [bestAlignment(readAlignments)]

                #
                # From here on we do only HSP-based filtering.
                #

                # Throw out any unwanted HSPs due to maxHspsPerHit.
                if maxHspsPerHit is not None:
                    for alignment in readAlignments:
                        hsps = alignment.hsps
                        if len(hsps) > maxHspsPerHit:
                            alignment.hsps = hsps[:maxHspsPerHit]

                # Throw out HSPs whose scores are not good enough.
                if scoreCutoff is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = [hsp for hsp in alignment.hsps
                                      if hsp.betterThan(scoreCutoff)]
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                # Throw out HSPs that don't match in the desired place on the
                # matched sequence.
                if minStart is not None or maxStop is not None:
                    wantedAlignments = []
                    for alignment in readAlignments:
                        wantedHsps = []
                        for hsp in alignment.hsps:
                            if not ((minStart is not None
                                     and hsp.readStartInSubject < minStart) or
                                    (maxStop is not None
                                     and hsp.readEndInSubject > maxStop)):
                                wantedHsps.append(hsp)
                        if wantedHsps:
                            alignment.hsps = wantedHsps
                            wantedAlignments.append(alignment)
                    if wantedAlignments:
                        readAlignments[:] = wantedAlignments
                    else:
                        continue

                yield readAlignments
                count += 1
        finally:
            # Runs on normal exhaustion, on the early return when the limit
            # is reached, on an exception, and when the generator is closed
            # or garbage collected before being fully consumed.
            if lineageFetcher is not None:
                lineageFetcher.close()
コード例 #4
0
              e,
              file=sys.stderr)
        sys.exit(1)

    if args.detailsFile is not None:
        detailsFp = open(args.detailsFile, 'w')

        def details(accept, readId, taxonomy):
            return writeDetails(accept, readId, taxonomy, detailsFp)
    else:
        detailsFp = None

        def details(accept, readId, taxonomy):
            return None

    lineageFetcher = LineageFetcher()
    reads = FastaReads(sys.stdin)
    save = sys.stdout.write
    readCount = saveCount = noTaxonomyCount = 0

    for read in reads:
        readCount += 1
        fasta = read.toString('fasta')
        taxonomy = lineageFetcher.lineage(read.id)
        if taxonomy:
            for taxonomyId, scientificName in taxonomy:
                if regexp.match(scientificName):
                    details(True, read.id, taxonomy)
                    if not args.invert:
                        saveCount += 1
                        save(fasta)