def testUnparseableTitleWithSpecificDefault(self):
    """
    If a title cannot be parsed, NCBISequenceLinkURL must hand back the
    very same default object that the caller supplied.
    """
    # A fresh object() can only be matched by identity, so the check
    # below proves the default was returned untouched.
    sentinel = object()
    result = NCBISequenceLinkURL('xxx', sentinel)
    self.assertIs(sentinel, result)
def addFile(self, filename, fp):
    """
    Read and record protein information for a sample.

    Each line of C{fp} must hold at least seven whitespace-separated
    fields: coverage, median score, best score, read count, HSP count,
    protein length, and the title (which may itself contain whitespace).

    @param filename: A C{str} file name. Used to derive the sample name
        (via C{self._sampleNameRegex}, if set) and the output directory.
    @param fp: An open file pointer to read the file's data from.
    @raise ValueError: If a line has fewer than seven fields or a numeric
        field cannot be converted.
    """
    if self._sampleNameRegex:
        match = self._sampleNameRegex.search(filename)
        sampleName = match.group(1) if match else filename
    else:
        sampleName = filename

    outDir = join(dirname(filename), self._assetDir)

    self.sampleNames[sampleName] = join(outDir, 'index.html')

    for index, proteinLine in enumerate(fp):
        # Strip only the line terminator. Unconditionally slicing off the
        # last character would eat a real title character when the final
        # line has no trailing newline, and would leave a stray '\r' in
        # the title for CRLF input.
        proteinLine = proteinLine.rstrip('\r\n')
        (coverage, medianScore, bestScore, readCount, hspCount,
         proteinLength, titles) = proteinLine.split(None, 6)

        match = self.VIRUS_RE.match(titles)
        if match:
            proteinTitle = match.group(1).strip()
            virusTitle = match.group(2)
        else:
            proteinTitle = titles
            virusTitle = self.NO_VIRUS_TITLE

        # The line index names the per-protein asset files in outDir.
        self.virusTitles[virusTitle][sampleName].append({
            'bestScore': float(bestScore),
            'bluePlotFilename': join(outDir, '%d.png' % index),
            'coverage': float(coverage),
            'fastaFilename': join(outDir, '%d.fasta' % index),
            'hspCount': int(hspCount),
            'index': index,
            'medianScore': float(medianScore),
            'proteinLength': int(proteinLength),
            'proteinTitle': proteinTitle,
            'proteinURL': NCBISequenceLinkURL(proteinTitle),
            'readCount': int(readCount),
        })
def alignmentPanel(titlesAlignments, sortOn='maxScore', idList=False,
                   equalizeXAxes=False, xRange='subject', logLinearXAxis=False,
                   rankScores=False, showFeatures=True,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE):
    """
    Draw a grid of small alignment graphs, one per title, and show it.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param sortOn: The attribute to sort subplots on. Either "maxScore",
        "medianScore", "readCount", "length", or "title".
    @param idList: A dictionary. Keys are colors and values are lists of read
        ids that should be colored using that color.
    @param equalizeXAxes: If C{True}, adjust the X axis on each alignment plot
        to be the same.
    @param xRange: Set to either 'subject' or 'reads' to indicate the range of
        the X axis.
    @param logLinearXAxis: If C{True}, convert read offsets so that empty
        regions in the plots we're preparing will only be as wide as their
        logged actual values.
    @param rankScores: If C{True}, change the scores for the reads for each
        title to be their rank (worst to best).
    @param showFeatures: If C{True}, look online for features of the subject
        sequences.
    @param logBase: The logarithm base to use if logLinearXAxis is C{True}.
    @raise ValueError: If C{xRange} is not "subject" or "reads".
    """
    if xRange not in ('subject', 'reads'):
        raise ValueError('xRange must be either "subject" or "reads".')

    start = time()
    titles = titlesAlignments.sortTitles(sortOn)

    # Lay the titles out in a fixed-width grid, rounding the number of rows
    # up so that a partially filled last row still gets drawn.
    cols = 5
    rows, leftOver = divmod(len(titles), cols)
    if leftOver:
        rows += 1

    figure, ax = plt.subplots(rows, cols, squeeze=False)
    coords = dimensionalIterator((rows, cols))
    allGraphInfo = {}

    report('Plotting %d titles in %dx%d grid, sorted on %s' %
           (len(titles), rows, cols, sortOn))

    for i, title in enumerate(titles):
        titleAlignments = titlesAlignments[title]
        row, col = next(coords)
        report('%d: %s %s' % (i, title, NCBISequenceLinkURL(title, '')))

        # Draw this title's small plot directly into its grid cell.
        info = alignmentGraph(
            titlesAlignments, title, addQueryLines=True,
            showFeatures=showFeatures, rankScores=rankScores,
            logLinearXAxis=logLinearXAxis, logBase=logBase,
            colorQueryBases=False, createFigure=False, showFigure=False,
            readsAx=ax[row][col], quiet=True, idList=idList, xRange=xRange,
            showOrfs=False)
        allGraphInfo[title] = info

        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()

        # Build a short heading for the cell, dropping any leading
        # NCBI gi / accession token from the title.
        if title.startswith('gi|') and ' ' in title:
            shortTitle = title.split(' ', 1)[1][:40]
        else:
            shortTitle = title[:40]

        readPlural = '' if readCount == 1 else 's'
        hspPlural = '' if hspCount == 1 else 's'
        plotTitle = '%d: %s\nLength %d, %d read%s, %d HSP%s.' % (
            i, shortTitle, titleAlignments.subjectLength,
            readCount, readPlural, hspCount, hspPlural)

        if hspCount:
            if rankScores:
                plotTitle += '\nY axis is ranked score'
            else:
                plotTitle += '\nmax %.2f, median %.2f' % (
                    titleAlignments.bestHsp().score.score,
                    titleAlignments.medianScore())

        ax[row][col].set_title(plotTitle, fontsize=10)

    # Overall extremes across every sub-plot.
    maxX = max(info['maxX'] for info in allGraphInfo.values())
    minX = min(info['minX'] for info in allGraphInfo.values())
    maxY = max(info['maxY'] for info in allGraphInfo.values())
    minY = min(info['minY'] for info in allGraphInfo.values())

    # All sub-plots share the same Y range.
    upperY = int(maxY * Y_AXIS_UPPER_PADDING)

    # Second pass over the grid: adjust axes and add guide lines.
    coords = dimensionalIterator((rows, cols))
    for title in titles:
        titleAlignments = titlesAlignments[title]
        row, col = next(coords)
        subplot = ax[row][col]
        subplot.set_ylim([0, upperY])
        if equalizeXAxes:
            subplot.set_xlim([minX, maxX])
        subplot.set_yticks([])
        subplot.set_xticks([])

        if xRange == 'subject' and minX < 0:
            # Mark x=0 so the 'whiskers' of reads extending to the left of
            # the subject sequence are visible.
            subplot.axvline(x=0, color='#cccccc')

        # Mark where the subject sequence ends. Every panel graph has the
        # same width, so the end would otherwise not be apparent.
        sequenceLen = titleAlignments.subjectLength
        if logLinearXAxis:
            sequenceLen = allGraphInfo[title]['adjustOffset'](sequenceLen)
        subplot.axvline(x=sequenceLen, color='#cccccc')

    # Whatever is left in the coordinate iterator corresponds to unused
    # cells at the end of the last row of the grid: blank those out.
    for row, col in coords:
        ax[row][col].axis('off')

    plt.subplots_adjust(hspace=0.4)

    scoreTitle = titlesAlignments.readsAlignments.params.scoreTitle
    figure.suptitle(
        'X: %d to %d, Y (%s): %d to %d' %
        (minX, maxX, scoreTitle, int(minY), int(maxY)), fontsize=20)
    figure.set_size_inches(5 * cols, 3 * rows, forward=True)
    figure.show()

    stop = time()
    report('Alignment panel generated in %.3f mins.' %
           ((stop - start) / 60.0))
def alignmentPanel(titlesAlignments, sortOn='maxScore', interactive=True,
                   outputDir=None, idList=False, equalizeXAxes=False,
                   xRange='subject', logLinearXAxis=False,
                   logBase=DEFAULT_LOG_LINEAR_X_AXIS_BASE, rankScores=False,
                   showFeatures=True):
    """
    Produces a rectangular panel of graphs that each contain an alignment graph
    against a given sequence.

    @param titlesAlignments: A L{dark.titles.TitlesAlignments} instance.
    @param sortOn: The attribute to sort subplots on. Either "maxScore",
        "medianScore", "readCount", "length", or "title".
    @param interactive: If C{True}, we are interactive and should display the
        panel using figure.show etc.
    @param outputDir: If not None, specifies a directory to write an HTML
        summary to. If the directory does not exist it will be created.
    @param idList: A dictionary. Keys are colors and values are lists of read
        ids that should be colored using that color.
    @param equalizeXAxes: If C{True}, adjust the X axis on each alignment plot
        to be the same.
    @param xRange: Set to either 'subject' or 'reads' to indicate the range of
        the X axis.
    @param logLinearXAxis: If C{True}, convert read offsets so that empty
        regions in the plots we're preparing will only be as wide as their
        logged actual values.
    @param logBase: The base of the logarithm to use if logLinearXAxis is
        C{True}.
    @param rankScores: If C{True}, change the scores for the reads for each
        title to be their rank (worst to best).
    @param showFeatures: If C{True}, look online for features of the subject
        sequences.
    @raise AssertionError: If C{xRange} is not "subject" or "reads", or if
        C{outputDir} exists but is not a directory. (These checks are
        C{assert} statements and so are skipped when Python runs with -O.)
    @raise ValueError: If neither C{interactive} nor C{outputDir} is true.
    """

    assert xRange in ('subject', 'reads'), (
        'xRange must be either "subject" or "reads".')

    if not (interactive or outputDir):
        raise ValueError('Either interactive or outputDir must be True')

    start = time()
    titles = titlesAlignments.sortTitles(sortOn)
    cols = 5
    rows = int(len(titles) / cols) + (0 if len(titles) % cols == 0 else 1)
    figure, ax = plt.subplots(rows, cols, squeeze=False)
    if interactive:
        report('Plotting %d titles in %dx%d grid, sorted on %s' %
               (len(titles), rows, cols, sortOn))
    allGraphInfo = {}

    if outputDir:
        if os.access(outputDir, os.F_OK):
            # outputDir exists. Check it's a directory.
            mode = os.stat(outputDir).st_mode
            assert S_ISDIR(mode), "%r is not a directory." % outputDir
        else:
            os.mkdir(outputDir)
        htmlOutput = AlignmentPanelHTML(outputDir, titlesAlignments)

    coords = dimensionalIterator((rows, cols))

    for i, title in enumerate(titles):
        titleAlignments = titlesAlignments[title]
        row, col = next(coords)
        if interactive:
            print('%d: %s %s' % (i, title, NCBISequenceLinkURL(title, '')))

        # If we are writing data to a file too, create a separate file with
        # a plot (this will be linked from the summary HTML).
        if outputDir:
            imageBasename = '%d.png' % i
            imageFile = '%s/%s' % (outputDir, imageBasename)
            graphInfo = alignmentGraph(titlesAlignments, title,
                                       addQueryLines=True,
                                       showFeatures=showFeatures,
                                       rankScores=rankScores,
                                       logLinearXAxis=logLinearXAxis,
                                       logBase=logBase,
                                       colorQueryBases=False,
                                       showFigure=False,
                                       imageFile=imageFile,
                                       quiet=True, idList=idList,
                                       xRange=xRange, showOrfs=True)

            # Close the image plot, otherwise it will be displayed if we
            # call plt.show below.
            plt.close()
            htmlOutput.addImage(imageBasename, title, graphInfo)

        # Add a small plot to the alignment panel.
        graphInfo = alignmentGraph(titlesAlignments, title,
                                   addQueryLines=True,
                                   showFeatures=showFeatures,
                                   rankScores=rankScores,
                                   logLinearXAxis=logLinearXAxis,
                                   logBase=logBase,
                                   colorQueryBases=False,
                                   createFigure=False, showFigure=False,
                                   readsAx=ax[row][col], quiet=True,
                                   idList=idList, xRange=xRange,
                                   showOrfs=False)

        allGraphInfo[title] = graphInfo
        readCount = titleAlignments.readCount()
        hspCount = titleAlignments.hspCount()

        # Make a short title for the small panel plot by dropping the
        # leading token (up to the first space). A title with no space has
        # nothing to drop: use it as-is instead of indexing past the end of
        # the split result.
        titleParts = title.split(' ', 1)
        if len(titleParts) > 1:
            shortTitle = titleParts[1][:40]
        else:
            shortTitle = title[:40]

        plotTitle = ('%d: %s\nLength %d, %d read%s, %d HSP%s.' % (
            i, shortTitle, titleAlignments.subjectLength,
            readCount, '' if readCount == 1 else 's',
            hspCount, '' if hspCount == 1 else 's'))

        if hspCount:
            if rankScores:
                plotTitle += '\nY axis is ranked score'
            else:
                plotTitle += '\nmax %.2f, median %.2f' % (
                    titleAlignments.bestHsp().score.score,
                    titleAlignments.medianScore())

        ax[row][col].set_title(plotTitle, fontsize=10)

    maxX = max(graphInfo['maxX'] for graphInfo in allGraphInfo.values())
    minX = min(graphInfo['minX'] for graphInfo in allGraphInfo.values())
    maxY = max(graphInfo['maxY'] for graphInfo in allGraphInfo.values())
    minY = min(graphInfo['minY'] for graphInfo in allGraphInfo.values())

    # Post-process graphs to adjust axes, etc.

    coords = dimensionalIterator((rows, cols))
    for title in titles:
        titleAlignments = titlesAlignments[title]
        row, col = next(coords)
        a = ax[row][col]
        a.set_ylim([0, int(maxY * Y_AXIS_UPPER_PADDING)])
        if equalizeXAxes:
            a.set_xlim([minX, maxX])
        a.set_yticks([])
        a.set_xticks([])

        if xRange == 'subject' and minX < 0:
            # Add a vertical line at x=0 so we can see the 'whiskers' of
            # reads that extend to the left of the sequence we're aligning
            # against.
            a.axvline(x=0, color='#cccccc')

        # Add a line on the right of each sub-plot so we can see where the
        # sequence ends (as all panel graphs have the same width and we
        # otherwise couldn't tell).
        sequenceLen = titleAlignments.subjectLength
        if logLinearXAxis:
            sequenceLen = allGraphInfo[title]['adjustOffset'](sequenceLen)
        a.axvline(x=sequenceLen, color='#cccccc')

    # Hide the final panel graphs (if any) that have no content. We do this
    # because the panel is a rectangular grid and some of the plots at the
    # end of the last row may be unused.
    for row, col in coords:
        ax[row][col].axis('off')

    # plt.subplots_adjust(left=0.01, bottom=0.01, right=0.99, top=0.93,
    # wspace=0.1, hspace=None)
    plt.subplots_adjust(hspace=0.4)
    figure.suptitle(
        'X: %d to %d, Y (%s): %d to %d' %
        (minX, maxX, titlesAlignments.readsAlignments.params.scoreTitle,
         int(minY), int(maxY)), fontsize=20)
    figure.set_size_inches(5 * cols, 3 * rows, forward=True)
    if outputDir:
        panelFilename = 'alignment-panel.png'
        figure.savefig('%s/%s' % (outputDir, panelFilename))
        htmlOutput.close(panelFilename)
    if interactive:
        figure.show()
    stop = time()
    if interactive:
        report('Alignment panel generated in %.3f mins.' %
               ((stop - start) / 60.0))
def _writeBody(self, fp):
    """
    Write the HTML body: a table of thumbnail images followed by one
    detailed section per image.

    Reads C{self._images} (dicts with 'title', 'accession',
    'imageBasename' and 'graphInfo' keys) and C{self._titlesAlignments},
    and calls C{self._writeReads} and C{self._writeFeatures} to produce
    the files linked from each section.

    @param fp: An object with a C{write} method that the HTML is written
        to.
    """
    images = self._images
    fp.write('<h1>Read alignments for %d matched subjects</h1>\n' %
             len(images))

    # Write out an alignment panel as a table.
    cols = 6
    fp.write('<table><tbody>\n')

    for i, image in enumerate(images):
        accession = image['accession']
        if i % cols == 0:
            fp.write('<tr>\n')

        fp.write(
            '<td><a id="small_%s"></a><a href="#big_%s"><img src="%s" '
            'class="thumbnail"/></a></td>\n' %
            (accession, accession, image['imageBasename']))

        if i % cols == cols - 1:
            fp.write('</tr>')

    # Add empty cells to the final table row, and close the row, if
    # necessary. This is computed from the image count rather than from
    # the loop variable, which is unbound when there are no images.
    usedInLastRow = len(images) % cols
    if usedInLastRow:
        for _ in range(cols - usedInLastRow):
            fp.write('<td> </td>\n')
        fp.write('</tr>\n')

    fp.write('</tbody></table>\n')

    # Write out the full images with additional detail.
    for i, image in enumerate(images):
        title = image['title']
        accession = image['accession']
        titleAlignments = self._titlesAlignments[title]
        graphInfo = image['graphInfo']
        readFormat = self._writeReads(image)
        fp.write(
            ' <a id="big_%s"></a> <h3>%d: %s</h3> <p> Length: %d. '
            'Read count: %d. HSP count: %d. <a href="%s.%s">%s</a>. '
            '<a href="#small_%s">Top panel.</a> ' %
            (accession, i, title,
             titleAlignments.subjectLength,
             titleAlignments.readCount(),
             titleAlignments.hspCount(),
             accession, readFormat, readFormat, accession))

        url = NCBISequenceLinkURL(title)
        if url:
            fp.write('<a href="%s" target="_blank">NCBI</a>.' % url)

        # Write out feature information.
        features = graphInfo['features']
        if features is None:
            # Feature lookup was False (or we were offline).
            pass
        elif len(features) == 0:
            fp.write('There were no features.')
        else:
            fp.write('<a href="%s">Features</a>' %
                     self._writeFeatures(i, image))

        # Write out the titles that this title invalidated due to its
        # read set.
        readSetFilter = self._titlesAlignments.readSetFilter
        if readSetFilter:
            invalidated = readSetFilter.invalidates(title)
            if invalidated:
                nInvalidated = len(invalidated)
                fp.write('<br/>This title invalidated %d other%s due to '
                         'its read set:<ul>'
                         % (nInvalidated,
                            '' if nInvalidated == 1 else 's'))
                # Use a distinct name so the title of the current image
                # is not overwritten.
                for invalidatedTitle in invalidated:
                    fp.write('<li>%s</li>' % invalidatedTitle)
                fp.write('</ul>')

        fp.write('</p><img src="%s" class="full-size"/>' %
                 image['imageBasename'])
def testAlternateDelimiter(self):
    """
    With '+' given as the delimiter and field 3 requested, the URL must
    be built from the fourth '+'-separated piece of the title.
    """
    expected = 'http://www.ncbi.nlm.nih.gov/nuccore/AY253278.1'
    url = NCBISequenceLinkURL(
        'gi+37955203+gb+AY253278.1+ H**o sapiens clone AL-11 HIV-1',
        3, '+')
    self.assertEqual(expected, url)
def testFieldNumber(self):
    """
    When field 3 is requested for a '|'-separated title, the URL must
    be built from the fourth piece of the title.
    """
    expected = 'http://www.ncbi.nlm.nih.gov/nuccore/M15204.1'
    url = NCBISequenceLinkURL(
        'gi|323924|gb|M15204.1|FCVMYCCA Feline leukemia virus myc gene', 3)
    self.assertEqual(expected, url)
def testNoField(self):
    """
    When no field number is given, the whole title is what must appear
    at the end of the resulting URL.
    """
    expected = 'http://www.ncbi.nlm.nih.gov/nuccore/xxx'
    url = NCBISequenceLinkURL('xxx')
    self.assertEqual(expected, url)
def addFile(self, filename, fp):
    """
    Read and record protein information for a sample.

    Each line of C{fp} must hold at least seven whitespace-separated
    fields: coverage, median score, best score, read count, HSP count,
    protein length, and the protein/pathogen names (which may themselves
    contain whitespace).

    @param filename: A C{str} file name. Used to derive the sample name
        (via C{self._sampleNameRegex}, if set) and the output directory.
    @param fp: An open file pointer to read the file's data from.
    @raise ValueError: If information for a pathogen/protein/sample
        combination is given more than once, or if a line has fewer than
        seven fields or a numeric field cannot be converted.
    """
    if self._sampleNameRegex:
        match = self._sampleNameRegex.search(filename)
        sampleName = match.group(1) if match else filename
    else:
        sampleName = filename

    outDir = join(dirname(filename), self._assetDir)

    self.sampleNames[sampleName] = join(outDir, 'index.html')

    for index, proteinLine in enumerate(fp):
        # Strip only the line terminator. Unconditionally slicing off the
        # last character would eat a real name character when the final
        # line has no trailing newline, and would leave a stray '\r' in
        # the names for CRLF input.
        proteinLine = proteinLine.rstrip('\r\n')
        (coverage, medianScore, bestScore, readCount, hspCount,
         proteinLength, names) = proteinLine.split(None, 6)

        proteinName, pathogenName = splitNames(names)

        if pathogenName not in self.pathogenNames:
            self.pathogenNames[pathogenName] = {}

        if sampleName not in self.pathogenNames[pathogenName]:
            self.pathogenNames[pathogenName][sampleName] = {
                'proteins': {},
                'uniqueReadCount': None,
            }

        proteins = self.pathogenNames[pathogenName][sampleName]['proteins']

        # We should only receive one line of information for a given
        # pathogen/sample/protein combination.
        if proteinName in proteins:
            raise ValueError(
                'Protein %r already seen for pathogen %r sample %r.' %
                (proteinName, pathogenName, sampleName))

        # The line index names the per-protein asset files in outDir.
        readsFilename = join(outDir, '%d.%s' % (index, self._format))

        proteins[proteinName] = {
            'bestScore': float(bestScore),
            'bluePlotFilename': join(outDir, '%d.png' % index),
            'coverage': float(coverage),
            'readsFilename': readsFilename,
            'hspCount': int(hspCount),
            'index': index,
            'medianScore': float(medianScore),
            'outDir': outDir,
            'proteinLength': int(proteinLength),
            'proteinName': proteinName,
            'proteinURL': NCBISequenceLinkURL(proteinName),
            'readCount': int(readCount),
        }

        if self._saveReadLengths:
            # Any format other than 'fasta' is read with FastqReads.
            readsClass = (FastaReads if self._format == 'fasta'
                          else FastqReads)
            proteins[proteinName]['readLengths'] = tuple(
                len(read) for read in readsClass(readsFilename))
def testUnparseableTitleReturnsNone(self):
    """
    When an unparseable title is passed, the default return value of
    NCBISequenceLinkURL is None.
    """
    # Check identity, not equality: an object that merely compares equal
    # to None must not satisfy this test.
    self.assertIsNone(NCBISequenceLinkURL(''))
def testGenericTitle2(self):
    """
    A '|'-separated title passed with no other arguments must produce a
    URL ending in the accession AY253278, without the '.1' that follows
    it in the title.
    """
    expected = 'http://www.ncbi.nlm.nih.gov/nuccore/AY253278'
    url = NCBISequenceLinkURL(
        'gi|37955203|gb|AY253278.1| H**o sapiens clone AL-11 HIV-1')
    self.assertEqual(expected, url)
def addFile(self, filename, fp):
    """
    Read and record protein information for a sample.

    Each line of C{fp} must hold at least seven whitespace-separated
    fields: coverage, median score, best score, read count, HSP count,
    protein length, and the protein/pathogen names (which may themselves
    contain whitespace).

    @param filename: A C{str} file name. Used to derive the sample name
        (unless C{self._sampleName} is set) and the output directory.
    @param fp: An open file pointer to read the file's data from.
    @raise ValueError: If information for a pathogen/protein/sample
        combination is given more than once, or if a line has fewer than
        seven fields or a numeric field cannot be converted.
    """
    # An explicitly configured sample name wins over one extracted from
    # the file name, which in turn wins over the raw file name.
    if self._sampleName:
        sampleName = self._sampleName
    elif self._sampleNameRegex:
        match = self._sampleNameRegex.search(filename)
        sampleName = match.group(1) if match else filename
    else:
        sampleName = filename

    outDir = join(dirname(filename), self._assetDir)

    self.sampleNames[sampleName] = join(outDir, 'index.html')

    for index, proteinLine in enumerate(fp):
        # Strip only the line terminator. Unconditionally slicing off the
        # last character would eat a real name character when the final
        # line has no trailing newline, and would leave a stray '\r' in
        # the names for CRLF input.
        proteinLine = proteinLine.rstrip('\r\n')
        (coverage, medianScore, bestScore, readCount, hspCount,
         proteinLength, names) = proteinLine.split(None, 6)

        proteinName, pathogenName = splitNames(names)

        # Ignore pathogens with names we don't want.
        if (self.titleFilter and self.titleFilter.accept(
                pathogenName) == TitleFilter.REJECT):
            continue

        if pathogenName not in self.pathogenNames:
            self.pathogenNames[pathogenName] = {}

        if sampleName not in self.pathogenNames[pathogenName]:
            self.pathogenNames[pathogenName][sampleName] = {
                'proteins': {},
                'uniqueReadCount': None,
            }

        proteins = self.pathogenNames[pathogenName][sampleName]['proteins']

        # We should only receive one line of information for a given
        # pathogen/sample/protein combination.
        if proteinName in proteins:
            raise ValueError(
                'Protein %r already seen for pathogen %r sample %r.' %
                (proteinName, pathogenName, sampleName))

        # The line index names the per-protein asset files in outDir.
        readsFilename = join(outDir, '%d.%s' % (index, self._format))

        if proteinName.count('|') < 5:
            # Assume this is an NCBI refseq id, like
            # YP_009137153.1 uracil glycosylase [Human alphaherpesvirus 2]
            # with a protein but not a genome accession.
            proteinURL = NCBISequenceLinkURL(proteinName, field=0,
                                             delim=' ')
            genomeURL = None
        else:
            # Assume this is an RVDB id, like
            # acc|GENBANK|ABJ91970.1|GENBANK|DQ876317|pol protein [HIV]
            # with both protein and genome accession numbers.
            proteinURL = NCBISequenceLinkURL(proteinName, field=2)
            genomeURL = NCBISequenceLinkURL(proteinName, field=4)

        proteinInfo = proteins[proteinName] = {
            'bestScore': float(bestScore),
            'bluePlotFilename': join(outDir, '%d.png' % index),
            'coverage': float(coverage),
            'readsFilename': readsFilename,
            'hspCount': int(hspCount),
            'index': index,
            'medianScore': float(medianScore),
            'outDir': outDir,
            'proteinLength': int(proteinLength),
            'proteinName': proteinName,
            'proteinURL': proteinURL,
            'genomeURL': genomeURL,
            'readCount': int(readCount),
        }

        # When the read and HSP counts agree, a single number is stored
        # (as the original string); otherwise both are stored, joined by
        # the class's separator.
        if proteinInfo['readCount'] == proteinInfo['hspCount']:
            proteinInfo['readAndHspCountStr'] = readCount
        else:
            proteinInfo['readAndHspCountStr'] = '%s%s%s' % (
                readCount, self.READ_AND_HSP_COUNT_STR_SEP, hspCount)

        if self._saveReadLengths:
            # Any format other than 'fasta' is read with FastqReads.
            readsClass = (FastaReads if self._format == 'fasta'
                          else FastqReads)
            proteins[proteinName]['readLengths'] = tuple(
                len(read) for read in readsClass(readsFilename))