Esempio n. 1
0
 def testUnparseableTitleWithSpecificDefault(self):
     """
     If the title cannot be parsed, NCBISequenceLinkURL must hand back the
     very same default object that the caller supplied.
     """
     # A fresh object() can only be matched by identity, so assertIs proves
     # that the passed default itself (not an equal value) is returned.
     sentinel = object()
     result = NCBISequenceLinkURL('xxx', sentinel)
     self.assertIs(sentinel, result)
Esempio n. 2
0
    def addFile(self, filename, fp):
        """
        Read and record protein information for a sample.

        Each line of C{fp} must hold seven whitespace-separated fields:
        coverage, median score, best score, read count, HSP count, protein
        length and the title (the title is the remainder of the line and may
        itself contain whitespace).

        The sample name is group 1 of the first match of
        C{self._sampleNameRegex} in C{filename}. If no regex is set, or it
        does not match, the file name itself is the sample name.

        @param filename: A C{str} file name.
        @param fp: An open file pointer to read the file's data from.
        @raise ValueError: If a line has fewer than seven fields or one of
            its numeric fields cannot be converted.
        """
        sampleName = filename
        if self._sampleNameRegex:
            match = self._sampleNameRegex.search(filename)
            if match:
                sampleName = match.group(1)

        outDir = join(dirname(filename), self._assetDir)

        self.sampleNames[sampleName] = join(outDir, 'index.html')

        for index, proteinLine in enumerate(fp):
            # Remove only the line terminator. Unconditionally dropping the
            # final character would truncate the title of a last line that
            # has no trailing newline.
            proteinLine = proteinLine.rstrip('\r\n')
            (coverage, medianScore, bestScore, readCount, hspCount,
             proteinLength, titles) = proteinLine.split(None, 6)

            match = self.VIRUS_RE.match(titles)
            if match:
                proteinTitle = match.group(1).strip()
                virusTitle = match.group(2)
            else:
                # No virus could be extracted: keep the whole title as the
                # protein title and file it under the placeholder virus.
                proteinTitle = titles
                virusTitle = self.NO_VIRUS_TITLE

            # The per-protein file names are derived from the line's index
            # in the input.
            self.virusTitles[virusTitle][sampleName].append({
                'bestScore': float(bestScore),
                'bluePlotFilename': join(outDir, '%d.png' % index),
                'coverage': float(coverage),
                'fastaFilename': join(outDir, '%d.fasta' % index),
                'hspCount': int(hspCount),
                'index': index,
                'medianScore': float(medianScore),
                'proteinLength': int(proteinLength),
                'proteinTitle': proteinTitle,
                'proteinURL': NCBISequenceLinkURL(proteinTitle),
                'readCount': int(readCount),
            })
Esempio n. 3
0
def alignmentPanel(titlesAlignments, sortOn='maxScore', idList=False,
                   equalizeXAxes=False, xRange='subject', logLinearXAxis=False,
                   rankScores=False, showFeatures=True,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE):
    """
    Draw a rectangular grid of small alignment graphs, one per title, and
    show the resulting figure.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param sortOn: The attribute to sort subplots on. Either "maxScore",
        "medianScore", "readCount", "length", or "title".
    @param idList: A dictionary. Keys are colors and values are lists of read
        ids that should be colored using that color.
    @param equalizeXAxes: If C{True}, give every subplot the same X axis
        limits (the overall minimum and maximum X values).
    @param xRange: Either 'subject' or 'reads', indicating the range of the
        X axis.
    @param logLinearXAxis: If C{True}, convert read offsets so that empty
        regions in the plots will only be as wide as their logged actual
        values.
    @param rankScores: If C{True}, change the scores for the reads for each
        title to be their rank (worst to best).
    @param showFeatures: If C{True}, look online for features of the subject
        sequences.
    @param logBase: The logarithm base to use if logLinearXAxis is C{True}.
    @raise ValueError: If C{xRange} is not "subject" or "reads".
    """
    validRanges = ('subject', 'reads')
    if xRange not in validRanges:
        raise ValueError('xRange must be either "subject" or "reads".')

    startTime = time()
    sortedTitles = titlesAlignments.sortTitles(sortOn)
    titleCount = len(sortedTitles)

    # The grid always has five columns; add a final, partly filled, row when
    # the titles do not divide evenly among the columns.
    nCols = 5
    fullRows, leftOver = divmod(titleCount, nCols)
    nRows = fullRows + (1 if leftOver else 0)

    figure, axes = plt.subplots(nRows, nCols, squeeze=False)
    graphInfoByTitle = {}
    cells = dimensionalIterator((nRows, nCols))

    report('Plotting %d titles in %dx%d grid, sorted on %s' %
           (titleCount, nRows, nCols, sortOn))

    for number, title in enumerate(sortedTitles):
        titleAlignments = titlesAlignments[title]
        row, col = next(cells)
        subplot = axes[row][col]
        report('%d: %s %s' % (number, title, NCBISequenceLinkURL(title, '')))

        # Draw this title's small plot directly onto its cell of the grid.
        info = alignmentGraph(
            titlesAlignments, title, addQueryLines=True,
            showFeatures=showFeatures, rankScores=rankScores,
            logLinearXAxis=logLinearXAxis, logBase=logBase,
            colorQueryBases=False, createFigure=False, showFigure=False,
            readsAx=subplot, quiet=True, idList=idList, xRange=xRange,
            showOrfs=False)
        graphInfoByTitle[title] = info

        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()

        # Shorten the title for the small plot, leaving out any leading
        # NCBI gi / accession numbers.
        hasGiPrefix = title.startswith('gi|') and ' ' in title
        shortTitle = (title.split(' ', 1)[1] if hasGiPrefix else title)[:40]

        readPlural = '' if readCount == 1 else 's'
        hspPlural = '' if hspCount == 1 else 's'
        plotTitle = '%d: %s\nLength %d, %d read%s, %d HSP%s.' % (
            number, shortTitle, titleAlignments.subjectLength,
            readCount, readPlural, hspCount, hspPlural)

        # Score details are only added when there is at least one HSP.
        if hspCount and rankScores:
            plotTitle += '\nY axis is ranked score'
        elif hspCount:
            plotTitle += '\nmax %.2f, median %.2f' % (
                titleAlignments.bestHsp().score.score,
                titleAlignments.medianScore())

        subplot.set_title(plotTitle, fontsize=10)

    # Find the overall extremes across all the small plots.
    allInfo = list(graphInfoByTitle.values())
    maxX = max(info['maxX'] for info in allInfo)
    minX = min(info['minX'] for info in allInfo)
    maxY = max(info['maxY'] for info in allInfo)
    minY = min(info['minY'] for info in allInfo)

    # Second pass over the used cells: adjust axes and add guide lines.
    cells = dimensionalIterator((nRows, nCols))
    markOrigin = xRange == 'subject' and minX < 0
    for title in sortedTitles:
        row, col = next(cells)
        subplot = axes[row][col]
        subplot.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
        if equalizeXAxes:
            subplot.set_xlim([minX, maxX])
        subplot.set_yticks([])
        subplot.set_xticks([])

        if markOrigin:
            # A vertical line at x=0 makes visible the 'whiskers' of reads
            # that extend to the left of the subject sequence.
            subplot.axvline(x=0, color='#cccccc')

        # A line at the subject's length shows where the sequence ends
        # (all panel graphs have the same width, so it would otherwise be
        # impossible to tell).
        subjectEnd = titlesAlignments[title].subjectLength
        if logLinearXAxis:
            subjectEnd = graphInfoByTitle[title]['adjustOffset'](subjectEnd)
        subplot.axvline(x=subjectEnd, color='#cccccc')

    # Whatever cells remain in the iterator are unused ones at the end of
    # the last row of the rectangular grid. Hide them.
    for row, col in cells:
        axes[row][col].axis('off')

    plt.subplots_adjust(hspace=0.4)
    scoreTitle = titlesAlignments.readsAlignments.params.scoreTitle
    figure.suptitle('X: %d to %d, Y (%s): %d to %d' %
                    (minX, maxX, scoreTitle, int(minY), int(maxY)),
                    fontsize=20)
    figure.set_size_inches(5 * nCols, 3 * nRows, forward=True)
    figure.show()
    elapsed = time() - startTime
    report('Alignment panel generated in %.3f mins.' % (elapsed / 60.0))
Esempio n. 4
0
def alignmentPanel(titlesAlignments,
                   sortOn='maxScore',
                   interactive=True,
                   outputDir=None,
                   idList=False,
                   equalizeXAxes=False,
                   xRange='subject',
                   logLinearXAxis=False,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE,
                   rankScores=False,
                   showFeatures=True):
    """
    Produces a rectangular panel of graphs that each contain an alignment graph
    against a given sequence.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param sortOn: The attribute to sort subplots on. Either "maxScore",
        "medianScore", "readCount", "length", or "title".
    @param interactive: If C{True}, we are interactive and should display the
        panel using figure.show etc.
    @param outputDir: If not None, specifies a directory to write an HTML
        summary to. If the directory does not exist it will be created.
    @param idList: A dictionary. Keys are colors and values are lists of read
        ids that should be colored using that color.
    @param equalizeXAxes: If C{True}, adjust the X axis on each alignment plot
        to be the same.
    @param xRange: Set to either 'subject' or 'reads' to indicate the range of
        the X axis.
    @param logLinearXAxis: If C{True}, convert read offsets so that empty
        regions in the plots we're preparing will only be as wide as their
        logged actual values.
    @param logBase: The base of the logarithm to use if logLinearXAxis is
        C{True}.
    @param rankScores: If C{True}, change the scores for the reads for each
        title to be their rank (worst to best).
    @param showFeatures: If C{True}, look online for features of the subject
        sequences.
    @raise ValueError: If neither C{interactive} nor C{outputDir} is true.
    @raise AssertionError: If C{xRange} is not "subject" or "reads", or if
        C{outputDir} exists but is not a directory (both are checked with
        assert statements).
    """

    assert xRange in ('subject',
                      'reads'), ('xRange must be either "subject" or "reads".')

    if not (interactive or outputDir):
        raise ValueError('Either interactive or outputDir must be True')

    start = time()
    titles = titlesAlignments.sortTitles(sortOn)
    cols = 5
    rows = int(len(titles) / cols) + (0 if len(titles) % cols == 0 else 1)
    figure, ax = plt.subplots(rows, cols, squeeze=False)
    if interactive:
        report('Plotting %d titles in %dx%d grid, sorted on %s' %
               (len(titles), rows, cols, sortOn))
    allGraphInfo = {}

    if outputDir:
        if os.access(outputDir, os.F_OK):
            # outputDir exists. Check it's a directory.
            mode = os.stat(outputDir).st_mode
            assert S_ISDIR(mode), "%r is not a directory." % outputDir
        else:
            os.mkdir(outputDir)
        htmlOutput = AlignmentPanelHTML(outputDir, titlesAlignments)

    coords = dimensionalIterator((rows, cols))

    # Keyword arguments common to the full-size plot (only made when writing
    # to outputDir) and the small panel plot of each title.
    graphArgs = dict(
        addQueryLines=True,
        showFeatures=showFeatures,
        rankScores=rankScores,
        logLinearXAxis=logLinearXAxis,
        logBase=logBase,
        colorQueryBases=False,
        showFigure=False,
        quiet=True,
        idList=idList,
        xRange=xRange)

    for i, title in enumerate(titles):
        titleAlignments = titlesAlignments[title]
        row, col = next(coords)
        if interactive:
            print('%d: %s %s' % (i, title, NCBISequenceLinkURL(title, '')))

        # If we are writing data to a file too, create a separate file with
        # a plot (this will be linked from the summary HTML).
        if outputDir:
            imageBasename = '%d.png' % i
            imageFile = '%s/%s' % (outputDir, imageBasename)
            graphInfo = alignmentGraph(titlesAlignments, title,
                                       imageFile=imageFile, showOrfs=True,
                                       **graphArgs)

            # Close the image plot, otherwise it will be displayed if we
            # call plt.show below.
            plt.close()
            htmlOutput.addImage(imageBasename, title, graphInfo)

        # Add a small plot to the alignment panel.
        graphInfo = alignmentGraph(titlesAlignments, title,
                                   createFigure=False, readsAx=ax[row][col],
                                   showOrfs=False, **graphArgs)

        allGraphInfo[title] = graphInfo
        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()

        # Drop the leading id token (everything up to the first space) from
        # the title shown above the small plot. A title that contains no
        # space has nothing to drop (and indexing the result of splitting
        # it would raise IndexError), so it is used whole.
        _, space, remainder = title.partition(' ')
        shortTitle = (remainder if space else title)[:40]

        plotTitle = ('%d: %s\nLength %d, %d read%s, %d HSP%s.' %
                     (i, shortTitle, titleAlignments.subjectLength,
                      readCount, '' if readCount == 1 else 's',
                      hspCount, '' if hspCount == 1 else 's'))

        if hspCount:
            if rankScores:
                plotTitle += '\nY axis is ranked score'
            else:
                plotTitle += '\nmax %.2f, median %.2f' % (
                    titleAlignments.bestHsp().score.score,
                    titleAlignments.medianScore())

        ax[row][col].set_title(plotTitle, fontsize=10)

    maxX = max(graphInfo['maxX'] for graphInfo in allGraphInfo.values())
    minX = min(graphInfo['minX'] for graphInfo in allGraphInfo.values())
    maxY = max(graphInfo['maxY'] for graphInfo in allGraphInfo.values())
    minY = min(graphInfo['minY'] for graphInfo in allGraphInfo.values())

    # Post-process graphs to adjust axes, etc.

    coords = dimensionalIterator((rows, cols))
    for title in titles:
        titleAlignments = titlesAlignments[title]
        row, col = next(coords)
        a = ax[row][col]
        a.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
        if equalizeXAxes:
            a.set_xlim([minX, maxX])
        a.set_yticks([])
        a.set_xticks([])

        if xRange == 'subject' and minX < 0:
            # Add a vertical line at x=0 so we can see the 'whiskers' of
            # reads that extend to the left of the sequence we're aligning
            # against.
            a.axvline(x=0, color='#cccccc')

        # Add a line on the right of each sub-plot so we can see where the
        # sequence ends (as all panel graphs have the same width and we
        # otherwise couldn't tell).
        sequenceLen = titleAlignments.subjectLength
        if logLinearXAxis:
            sequenceLen = allGraphInfo[title]['adjustOffset'](sequenceLen)
        a.axvline(x=sequenceLen, color='#cccccc')

    # Hide the final panel graphs (if any) that have no content. We do this
    # because the panel is a rectangular grid and some of the plots at the
    # end of the last row may be unused.
    for row, col in coords:
        ax[row][col].axis('off')

    plt.subplots_adjust(hspace=0.4)
    figure.suptitle(
        'X: %d to %d, Y (%s): %d to %d' %
        (minX, maxX, titlesAlignments.readsAlignments.params.scoreTitle,
         int(minY), int(maxY)),
        fontsize=20)
    figure.set_size_inches(5 * cols, 3 * rows, forward=True)
    if outputDir:
        panelFilename = 'alignment-panel.png'
        figure.savefig('%s/%s' % (outputDir, panelFilename))
        htmlOutput.close(panelFilename)
    if interactive:
        figure.show()
    stop = time()
    if interactive:
        report('Alignment panel generated in %.3f mins.' %
               ((stop - start) / 60.0))
Esempio n. 5
0
    def _writeBody(self, fp):
        fp.write('<h1>Read alignments for %d matched subjects</h1>\n' %
                 len(self._images))

        # Write out an alignment panel as a table.
        cols = 6
        fp.write('<table><tbody>\n')

        for i, image in enumerate(self._images):
            title = image['title']
            accession = image['accession']
            if i % cols == 0:
                fp.write('<tr>\n')

            fp.write(
                '<td><a id="small_%s"></a><a href="#big_%s"><img src="%s" '
                'class="thumbnail"/></a></td>\n' %
                (accession, accession, image['imageBasename']))

            if i % cols == cols - 1:
                fp.write('</tr>')

        # Add empty cells to the final table row, and close the row, if
        # necessary.
        if i % cols < cols - 1:
            while i % cols < cols - 1:
                fp.write('<td>&nbsp;</td>\n')
                i += 1
            fp.write('</tr>\n')

        fp.write('</tbody></table>\n')

        # Write out the full images with additional detail.
        for i, image in enumerate(self._images):
            title = image['title']
            accession = image['accession']
            titleAlignments = self._titlesAlignments[title]
            graphInfo = image['graphInfo']
            readFormat = self._writeReads(image)
            fp.write("""
      <a id="big_%s"></a>
      <h3>%d: %s</h3>
      <p>
            Length: %d.
            Read count: %d.
            HSP count: %d.
            <a href="%s.%s">%s</a>.
            <a href="#small_%s">Top panel.</a>
""" % (accession, i, title, titleAlignments.subjectLength,
            titleAlignments.readCount(), titleAlignments.hspCount(), accession,
            readFormat, readFormat, accession))

            url = NCBISequenceLinkURL(title)
            if url:
                fp.write('<a href="%s" target="_blank">NCBI</a>.' % url)

            # Write out feature information.
            if graphInfo['features'] is None:
                # Feature lookup was False (or we were offline).
                pass
            elif len(graphInfo['features']) == 0:
                fp.write('There were no features.')
            else:
                fp.write('<a href="%s">Features</a>' %
                         self._writeFeatures(i, image))

            # Write out the titles that this title invalidated due to its
            # read set.
            readSetFilter = self._titlesAlignments.readSetFilter
            if readSetFilter:
                invalidated = readSetFilter.invalidates(title)
                if invalidated:
                    nInvalidated = len(invalidated)
                    fp.write('<br/>This title invalidated %d other%s due to '
                             'its read set:<ul>' %
                             (nInvalidated, '' if nInvalidated == 1 else 's'))
                    for title in invalidated:
                        fp.write('<li>%s</li>' % title)
                    fp.write('</ul>')

            fp.write('</p><img src="%s" class="full-size"/>' %
                     image['imageBasename'])
Esempio n. 6
0
 def testAlternateDelimiter(self):
     """
     When 3 and '+' are passed along with a title whose id fields are
     separated by '+', the URL must be built from the AY253278.1 accession.
     """
     title = 'gi+37955203+gb+AY253278.1+ H**o sapiens clone AL-11 HIV-1'
     expected = 'http://www.ncbi.nlm.nih.gov/nuccore/AY253278.1'
     url = NCBISequenceLinkURL(title, 3, '+')
     self.assertEqual(expected, url)
Esempio n. 7
0
 def testFieldNumber(self):
     """
     When 3 is passed along with a '|'-separated title, the URL must be
     built from the M15204.1 accession.
     """
     title = 'gi|323924|gb|M15204.1|FCVMYCCA Feline leukemia virus myc gene'
     expected = 'http://www.ncbi.nlm.nih.gov/nuccore/M15204.1'
     url = NCBISequenceLinkURL(title, 3)
     self.assertEqual(expected, url)
Esempio n. 8
0
 def testNoField(self):
     """
     With no field given, the URL must be built from the title as passed.
     """
     expected = 'http://www.ncbi.nlm.nih.gov/nuccore/xxx'
     url = NCBISequenceLinkURL('xxx')
     self.assertEqual(expected, url)
Esempio n. 9
0
    def addFile(self, filename, fp):
        """
        Read and record protein information for a sample.

        Each line of C{fp} must hold seven whitespace-separated fields:
        coverage, median score, best score, read count, HSP count, protein
        length and the names (the names are the remainder of the line and
        may themselves contain whitespace).

        The sample name is group 1 of the first match of
        C{self._sampleNameRegex} in C{filename}. If no regex is set, or it
        does not match, the file name itself is the sample name.

        @param filename: A C{str} file name.
        @param fp: An open file pointer to read the file's data from.
        @raise ValueError: If information for a pathogen/protein/sample
            combination is given more than once, if a line has fewer than
            seven fields, or if a numeric field cannot be converted.
        """
        sampleName = filename
        if self._sampleNameRegex:
            match = self._sampleNameRegex.search(filename)
            if match:
                sampleName = match.group(1)

        outDir = join(dirname(filename), self._assetDir)

        self.sampleNames[sampleName] = join(outDir, 'index.html')

        for index, proteinLine in enumerate(fp):
            # Remove only the line terminator. Unconditionally dropping the
            # final character would truncate the names of a last line that
            # has no trailing newline.
            proteinLine = proteinLine.rstrip('\r\n')
            (coverage, medianScore, bestScore, readCount, hspCount,
             proteinLength, names) = proteinLine.split(None, 6)

            proteinName, pathogenName = splitNames(names)

            if pathogenName not in self.pathogenNames:
                self.pathogenNames[pathogenName] = {}

            if sampleName not in self.pathogenNames[pathogenName]:
                self.pathogenNames[pathogenName][sampleName] = {
                    'proteins': {},
                    'uniqueReadCount': None,
                }

            proteins = self.pathogenNames[pathogenName][sampleName]['proteins']

            # We should only receive one line of information for a given
            # pathogen/sample/protein combination.
            if proteinName in proteins:
                raise ValueError(
                    'Protein %r already seen for pathogen %r sample %r.' %
                    (proteinName, pathogenName, sampleName))

            # The per-protein file names are derived from the line's index
            # in the input.
            readsFilename = join(outDir, '%d.%s' % (index, self._format))

            proteinInfo = proteins[proteinName] = {
                'bestScore': float(bestScore),
                'bluePlotFilename': join(outDir, '%d.png' % index),
                'coverage': float(coverage),
                'readsFilename': readsFilename,
                'hspCount': int(hspCount),
                'index': index,
                'medianScore': float(medianScore),
                'outDir': outDir,
                'proteinLength': int(proteinLength),
                'proteinName': proteinName,
                'proteinURL': NCBISequenceLinkURL(proteinName),
                'readCount': int(readCount),
            }

            if self._saveReadLengths:
                # Anything other than 'fasta' is read as FASTQ.
                readsClass = (FastaReads if self._format == 'fasta'
                              else FastqReads)
                proteinInfo['readLengths'] = tuple(
                    len(read) for read in readsClass(readsFilename))
Esempio n. 10
0
 def testUnparseableTitleReturnsNone(self):
     """
     If the title cannot be parsed and no default is supplied,
     NCBISequenceLinkURL must return None.
     """
     url = NCBISequenceLinkURL('')
     self.assertEqual(None, url)
Esempio n. 11
0
 def testGenericTitle2(self):
     """
     A '|'-separated gi/gb title passed on its own must give a URL for the
     AY253278 accession (the expected URL has no '.1' version suffix).
     """
     title = 'gi|37955203|gb|AY253278.1| H**o sapiens clone AL-11 HIV-1'
     expected = 'http://www.ncbi.nlm.nih.gov/nuccore/AY253278'
     url = NCBISequenceLinkURL(title)
     self.assertEqual(expected, url)
Esempio n. 12
0
    def addFile(self, filename, fp):
        """
        Read and record protein information for a sample.

        Each line of C{fp} must hold seven whitespace-separated fields:
        coverage, median score, best score, read count, HSP count, protein
        length and the names (the names are the remainder of the line and
        may themselves contain whitespace).

        The sample name is C{self._sampleName} if that is set. Otherwise it
        is group 1 of the first match of C{self._sampleNameRegex} in
        C{filename}, or the file name itself if no regex is set or the regex
        does not match.

        @param filename: A C{str} file name.
        @param fp: An open file pointer to read the file's data from.
        @raise ValueError: If information for a pathogen/protein/sample
            combination is given more than once, if a line has fewer than
            seven fields, or if a numeric field cannot be converted.
        """
        sampleName = filename
        if self._sampleName:
            sampleName = self._sampleName
        elif self._sampleNameRegex:
            match = self._sampleNameRegex.search(filename)
            if match:
                sampleName = match.group(1)

        outDir = join(dirname(filename), self._assetDir)

        self.sampleNames[sampleName] = join(outDir, 'index.html')

        for index, proteinLine in enumerate(fp):
            # Remove only the line terminator. Unconditionally dropping the
            # final character would truncate the names of a last line that
            # has no trailing newline.
            proteinLine = proteinLine.rstrip('\r\n')
            (coverage, medianScore, bestScore, readCount, hspCount,
             proteinLength, names) = proteinLine.split(None, 6)

            proteinName, pathogenName = splitNames(names)

            # Ignore pathogens with names we don't want.
            if (self.titleFilter and self.titleFilter.accept(
                    pathogenName) == TitleFilter.REJECT):
                continue

            if pathogenName not in self.pathogenNames:
                self.pathogenNames[pathogenName] = {}

            if sampleName not in self.pathogenNames[pathogenName]:
                self.pathogenNames[pathogenName][sampleName] = {
                    'proteins': {},
                    'uniqueReadCount': None,
                }

            proteins = self.pathogenNames[pathogenName][sampleName]['proteins']

            # We should only receive one line of information for a given
            # pathogen/sample/protein combination.
            if proteinName in proteins:
                raise ValueError(
                    'Protein %r already seen for pathogen %r sample %r.' %
                    (proteinName, pathogenName, sampleName))

            # The per-protein file names are derived from the line's index
            # in the input (skipped lines still consume an index).
            readsFilename = join(outDir, '%d.%s' % (index, self._format))

            if proteinName.count('|') < 5:
                # Assume this is an NCBI refseq id, like
                # YP_009137153.1 uracil glycosylase [Human alphaherpesvirus 2]
                # with a protein but not a genome accession.
                proteinURL = NCBISequenceLinkURL(proteinName, field=0,
                                                 delim=' ')
                genomeURL = None
            else:
                # Assume this is an RVDB id, like
                # acc|GENBANK|ABJ91970.1|GENBANK|DQ876317|pol protein [HIV]
                # with both protein and genome accession numbers.
                proteinURL = NCBISequenceLinkURL(proteinName, field=2)
                genomeURL = NCBISequenceLinkURL(proteinName, field=4)

            proteinInfo = proteins[proteinName] = {
                'bestScore': float(bestScore),
                'bluePlotFilename': join(outDir, '%d.png' % index),
                'coverage': float(coverage),
                'readsFilename': readsFilename,
                'hspCount': int(hspCount),
                'index': index,
                'medianScore': float(medianScore),
                'outDir': outDir,
                'proteinLength': int(proteinLength),
                'proteinName': proteinName,
                'proteinURL': proteinURL,
                'genomeURL': genomeURL,
                'readCount': int(readCount),
            }

            # Show a single number when the read and HSP counts agree,
            # otherwise show both (as the strings read from the input).
            if proteinInfo['readCount'] == proteinInfo['hspCount']:
                proteinInfo['readAndHspCountStr'] = readCount
            else:
                proteinInfo['readAndHspCountStr'] = '%s%s%s' % (
                    readCount, self.READ_AND_HSP_COUNT_STR_SEP, hspCount)

            if self._saveReadLengths:
                # Anything other than 'fasta' is read as FASTQ.
                readsClass = (FastaReads if self._format == 'fasta'
                              else FastqReads)
                proteinInfo['readLengths'] = tuple(
                    len(read) for read in readsClass(readsFilename))