def _getReferenceIds(self, alignedReferences, referenceIds):
    """
    Figure out which reference ids we can process.

    @param alignedReferences: A C{set} of C{str} reference ids found in
        the passed reference files.
    @param referenceIds: A C{list} of C{str} reference ids for which
        processing has specifically been requested, or C{None}.
    @return: A C{set} of C{str} reference ids to process.
    """
    if referenceIds:
        # Specific reference ids were given. Check that each appears in
        # some alignment file and that we have a genome for each. Any
        # error here causes a message to be printed to standard error,
        # followed by an exit.
        missing = set(referenceIds) - alignedReferences
        if missing:
            print(
                'Alignments against the following reference id%s are not '
                'present in any alignment file:\n%s' %
                (s(len(missing)),
                 '\n'.join('  %s' % id_ for id_ in sorted(missing))),
                file=sys.stderr)
            sys.exit(1)

        missing = set(referenceIds) - set(self.referenceGenomes)
        if missing:
            print('Reference id%s %s not present in any reference genome '
                  'file.' % (s(len(missing)), commas(missing)),
                  file=sys.stderr)
            sys.exit(1)
    else:
        # We weren't told which specific reference ids to examine the
        # alignments of, so examine all references mentioned in any
        # alignment file for which we also have a genome. Mention any
        # references from alignment files that we can't process due to
        # a missing genome.
        missing = alignedReferences - set(self.referenceGenomes)
        if missing:
            self.report(
                'No analysis will be performed on reference%s %s '
                '(found in SAM/BAM alignment file(s) headers) because no '
                'corresponding reference genome was found.' %
                (s(len(missing)), commas(missing)))

        referenceIds = alignedReferences & set(self.referenceGenomes)

        if referenceIds:
            self.report('Examining %d reference%s: %s' %
                        (len(referenceIds), s(len(referenceIds)),
                         commas(referenceIds)))
        else:
            print(
                'Nothing to do! No genome could be found for any aligned '
                'reference. Found reference%s: %s' %
                (s(len(alignedReferences)), commas(alignedReferences)),
                file=sys.stderr)
            sys.exit(1)

    return referenceIds
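# --- Illustrative sketch (not part of the original module) ---
# The s() and commas() helpers used throughout this code are imported
# from elsewhere in the repository. These are minimal sketches of their
# assumed behavior, inferred from call sites such as
# '%d read%s' % (n, s(n)) and commas(ids); the real implementations
# may differ in detail:

def s(count, suffix='s'):
    """Return a plural suffix for counts other than one."""
    return '' if count == 1 else suffix


def commas(items):
    """Return a comma-separated string of the sorted items."""
    return ', '.join(map(str, sorted(items)))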
def _readReferenceGenomes(self, referenceGenomeFiles):
    """
    Read reference genomes from files and check that any duplicates have
    identical sequences.

    @param referenceGenomeFiles: A C{list} of C{str} names of FASTA files
        containing reference genomes.
    @raise ValueError: If a reference genome is found in more than one
        file and the sequences are not identical.
    @return: A C{dict} keyed by C{str} sequence id with C{dark.Read}
        values holding reference genomes.
    """
    result = {}
    seen = {}
    for filename in referenceGenomeFiles:
        for read in FastaReads(filename):
            id_ = read.id
            if id_ in seen:
                if result[id_].sequence != read.sequence:
                    raise ValueError(
                        'Reference genome id %r was found in two files '
                        '(%r and %r) but with different sequences.' %
                        (id_, seen[id_], filename))
            else:
                seen[id_] = filename
                result[id_] = read

    self.report('Read %d reference genome%s:\n%s' %
                (len(result), s(len(result)),
                 '\n'.join('  %s' % id_ for id_ in sorted(result))),
                requiredVerbosityLevel=2)

    return result
def __str__(self):
    result = [
        'Cluster with %d read%s:' % (len(self.reads), s(len(self.reads)))
    ]
    for read in self.reads:
        result.append('  %s' % read)
    result.append(nucleotidesToStr(self.nucleotides, prefix='  '))
    return '\n'.join(result)
def _removePreExistingTopLevelOutputDirFiles(self):
    """
    Remove all pre-existing files from the top-level output directory.
    """
    paths = list(
        map(str, chain(Path(self.outputDir).glob('result-summary.txt'))))

    if paths:
        self.report('  Removing %d pre-existing output file%s from '
                    'top-level output directory %s.' %
                    (len(paths), s(len(paths)), self.outputDir),
                    requiredVerbosityLevel=2)
        list(map(unlink, paths))
def summarize(self, fp, count, componentOffsets, referenceSequence):
    """
    Write out a summary of this consistent component.

    @param fp: The file pointer to write to.
    @param count: The C{int} number of this component.
    @param componentOffsets: The C{set} of offsets in this component.
    @param referenceSequence: The C{str} reference sequence.
    """
    plural = s(len(self.reads))
    print('    Component %d: %d read%s, covering %d offset%s' %
          (count, len(self.reads), plural, len(self.nucleotides),
           s(len(self.nucleotides))),
          file=fp)
    print('    Nucleotide counts for each offset:', file=fp)
    print(nucleotidesToStr(self.nucleotides, '      '), file=fp)
    print('    Consensus sequence: %s' %
          self.consensusSequence(componentOffsets, referenceSequence, fp),
          file=fp)
    print('    Read%s:' % plural, file=fp)
    for read in sorted(self.reads):
        print('     ', read, file=fp)
def plotBaseFrequencies(significantOffsets, baseCountAtOffset,
                        readCountAtOffset, outfile, title=None,
                        sampleName=None, valuesFile=None, minReads=5,
                        homogeneousCutoff=0.9, sortOn=None,
                        histogram=False, show=False, titleFontSize=12,
                        axisFontSize=12):
    """
    Plot sorted base frequencies at significant sites.
    """
    subtitle = (
        '<br>%d significant sites. Min %d read%s per site. '
        '%.2f homogeneity cutoff.' %
        (len(significantOffsets), minReads, s(minReads),
         homogeneousCutoff))

    if sortOn is None:
        title = title or 'Base frequencies (sorted)'
        _plotBaseFrequencies(significantOffsets, baseCountAtOffset,
                             readCountAtOffset, outfile, title + subtitle,
                             show, titleFontSize, axisFontSize)
    elif sortOn == 'max':
        title = title or 'Maximum base frequency'
        result = _plotSortedMaxBaseFrequencies(
            significantOffsets, baseCountAtOffset, readCountAtOffset,
            outfile, title + subtitle, histogram, show, titleFontSize,
            axisFontSize)
    else:
        assert sortOn == 'entropy', ('Unknown --sortOn value: %r' % sortOn)
        title = title or 'Base frequency entropy'
        result = _plotBaseFrequenciesEntropy(
            significantOffsets, baseCountAtOffset, readCountAtOffset,
            outfile, title + subtitle, histogram, show, titleFontSize,
            axisFontSize)

    if valuesFile:
        # Note: this will fail with a NameError if sortOn is None, since
        # no result is produced in that case (see above).
        with open(valuesFile, 'w') as fp:
            dump(
                {
                    'sampleName': sampleName,
                    'text': [text for _, text in result],
                    'values': [value for value, _ in result],
                },
                fp)
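# --- Illustrative sketch (not part of the original module) ---
# A hypothetical usage example for plotBaseFrequencies, showing the
# expected argument shapes: per-offset Counter instances and read
# counts, as produced by gatherData elsewhere in this code. The data
# values here are invented for illustration only.

def _demoPlotBaseFrequencies():
    from collections import Counter
    genomeLength = 20
    baseCountAtOffset = [Counter() for _ in range(genomeLength)]
    # Two heterogeneous sites, at 0-based offsets 3 and 17.
    baseCountAtOffset[3].update('AAAAACC')
    baseCountAtOffset[17].update('GGGTTTT')
    readCountAtOffset = [sum(counts.values())
                         for counts in baseCountAtOffset]
    significantOffsets = [3, 17]
    plotBaseFrequencies(significantOffsets, baseCountAtOffset,
                        readCountAtOffset, 'base-frequencies.html',
                        sortOn='max')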
def plotSignificantOffsets(fig, row, col, significantOffsets,
                           genomeLength):
    """
    Plot the genome offsets that are significant.
    """
    n = len(significantOffsets)
    trace = go.Scatter(x=[i + 1 for i in significantOffsets],
                       y=[1.0] * n,
                       mode='markers',
                       showlegend=False)
    fig.append_trace(trace, row, col)
    fig['layout']['annotations'][1]['text'] = (
        '%d significant genome location%s' % (n, s(n)))
    fig['layout']['xaxis'].update({
        'range': (0, genomeLength + 1),
    })
def _removePreExistingReferenceDirFiles(self, directory):
    """
    Remove all pre-existing files from the output directory for a
    particular reference sequence alignment.

    @param directory: The C{str} directory to examine.
    """
    # This prevents us from doing a run that results in (say) 6
    # component files and then later doing a run that results in
    # only 5 components and erroneously thinking that
    # component-6-2.fasta etc. are from the most recent run.
    paths = list(
        map(
            str,
            chain(
                Path(directory).glob('*.fasta'),
                Path(directory).glob('*.html'),
                Path(directory).glob('*.txt'))))

    if paths:
        self.report('  Removing %d pre-existing output file%s from %s '
                    'directory.' %
                    (len(paths), s(len(paths)), directory),
                    requiredVerbosityLevel=2)
        list(map(unlink, paths))
def initialReferenceIdAnalysis(self, referenceId, alignmentFile,
                               outputDir):
    """
    Analyze the given reference id in the given alignment file (if an
    alignment to the reference id is present).

    @param referenceId: The C{str} id of the reference sequence to
        analyze.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    @return: C{None} if C{referenceId} is not present in
        C{alignmentFile} or if no significant offsets are found. Else, a
        tuple of (genomeLength, alignedReads, readCountAtOffset,
        baseCountAtOffset, readsAtOffset, significantOffsets, samFilter,
        paddedSAM).
    """
    # Make sure this reference id is in this alignment file and if so
    # get its length (and check it's the same as the length of the
    # sequence given in the reference file).
    with samfile(alignmentFile) as sam:
        tid = sam.get_tid(referenceId)
        if tid == -1:
            # This referenceId is not in this alignment file.
            self.report('  Reference %s not in alignment file.' %
                        referenceId)
            return
        else:
            genomeLength = sam.lengths[tid]
            # Sanity check.
            assert genomeLength == len(
                self.referenceGenomes[referenceId])

    if self.plotSAM:
        filename = join(outputDir, 'reads.html')
        self.report('  Saving reads alignment plot to %s' % filename)
        plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                filename, title=referenceId, jitter=0.45)

    alignedReads = []
    samFilter = SAMFilter(
        alignmentFile, referenceIds={referenceId},
        dropDuplicates=True, dropSupplementary=True,
        # dropSecondary=True,
        storeQueryIds=True)
    paddedSAM = PaddedSAM(samFilter)
    for query in paddedSAM.queries(addAlignment=True):
        assert len(query) == genomeLength
        alignedReads.append(
            AlignedRead(query.id, query.sequence, query.alignment))

    # Sanity check that all aligned reads have different ids. This
    # should be the case because the padded SAM queries method adds /2,
    # /3 etc. to queries that have more than one alignment.
    assert len(alignedReads) == len(
        set(read.id for read in alignedReads))

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    significantOffsets = list(
        findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                               self.minReads, self.homogeneousCutoff))

    self.report(
        '  %d alignment%s (of %d unique %s) read from %s' %
        (samFilter.alignmentCount, s(samFilter.alignmentCount),
         len(samFilter.queryIds),
         'query' if len(samFilter.queryIds) == 1 else 'queries',
         alignmentFile))
    self.report('  %d of which %s aligned to %s' %
                (len(alignedReads),
                 'was' if len(alignedReads) == 1 else 'were',
                 referenceId))
    self.report('  Reference genome length %d' % genomeLength)
    self.report('  Found %d significant location%s' %
                (len(significantOffsets), s(len(significantOffsets))))

    self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

    if not significantOffsets:
        self.report('  No significant locations found.')
        return

    if self.saveReducedFASTA:
        self.saveReducedFasta(significantOffsets, outputDir)

    self._plotCoverageAndSignificantLocations(
        referenceId, alignmentFile, readCountAtOffset, genomeLength,
        significantOffsets, outputDir)

    self.saveSignificantOffsets(significantOffsets, outputDir)

    for read in alignedReads:
        read.setSignificantOffsets(significantOffsets)

    self.saveReferenceBaseFrequencyPlot(
        referenceId, genomeLength, significantOffsets,
        baseCountAtOffset, readCountAtOffset, outputDir)

    # Save the reference.
    filename = join(outputDir, 'reference.fasta')
    self.report('  Saving reference to', filename)
    reference = self.referenceGenomes[referenceId]
    Reads([reference]).save(filename)

    # Extract a consensus according to bcftools.
    self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

    return (genomeLength, alignedReads, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets,
            samFilter, paddedSAM)
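# --- Illustrative sketch (an assumption, not the repository's code) ---
# findSignificantOffsets, called above, is defined elsewhere in the
# repository. Based on its minReads and homogeneousCutoff arguments, a
# plausible minimal version treats an offset as significant when enough
# reads cover it and no single base dominates:

def _findSignificantOffsetsSketch(baseCountAtOffset, readCountAtOffset,
                                  minReads, homogeneousCutoff):
    for offset, counts in enumerate(baseCountAtOffset):
        readCount = readCountAtOffset[offset]
        if (readCount >= minReads and counts and
                max(counts.values()) / readCount <= homogeneousCutoff):
            yield offset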
    '--idSuffix', default='',
    help=('Add this string to the end of the read ids. This is added '
          'after the string added by --editIds (if also used).'))

parser.add_argument(
    '--editIds', action='store_true', default=False,
    help=('Add "-mutations:N" to the end of each read id, where N '
          'is the number of mutations introduced to the read.'))

addFASTACommandLineOptions(parser)
args = parser.parse_args()
reads = parseFASTACommandLineOptions(args)

rate = args.rate
verbose = args.verbose
editIds = args.editIds
idSuffix = args.idSuffix
format_ = 'fastq' if args.fastq else 'fasta'

for read in reads:
    count = len(mutateRead(read, rate))
    if verbose:
        print('%d mutation%s made in read (len %d) %s' %
              (count, s(count), len(read), read.id), file=sys.stderr)
    read.id = (read.id +
               (('-mutations:%d' % count) if editIds else '') +
               idSuffix)
    print(read.toString(format_), end='')
def plotConsistency(significantOffsets, baseCountAtOffset, readsAtOffset,
                    minCommonReads, outfile, title, show):
    """
    Plot a heatmap of the pairwise consistency of the reads covering
    the significant offsets.
    """
    scores = []
    text = []
    for offset1 in significantOffsets:
        rowScores = []
        rowText = []
        for offset2 in significantOffsets:
            if offset2 == offset1:
                rowScores.append(1.0)
                rowText.append('%d identity (%d read%s)' %
                               (offset1 + 1,
                                len(readsAtOffset[offset1]),
                                s(len(readsAtOffset[offset1]))))
            else:
                readsCoveringBoth = (readsAtOffset[offset1] &
                                     readsAtOffset[offset2])
                if readsCoveringBoth:
                    commonCount = len(readsCoveringBoth)
                    if commonCount < minCommonReads:
                        rowScores.append(0.0)
                        rowText.append(
                            '%d (%d read%s) vs %d (%d read%s)<br>'
                            'Too few (%d) reads in common' %
                            (offset2 + 1, len(readsAtOffset[offset2]),
                             s(len(readsAtOffset[offset2])), offset1 + 1,
                             len(readsAtOffset[offset1]),
                             s(len(readsAtOffset[offset1])),
                             commonCount))
                    else:
                        bases1 = []
                        bases2 = []
                        for read in readsCoveringBoth:
                            bases1.append(read.base(offset1))
                            bases2.append(read.base(offset2))

                        # Use the normalized information distance above
                        # the diagonal and the adjusted Rand index below
                        # it (adjusted mutual information is a possible
                        # alternative to the former):
                        # f = (adjusted_mutual_info_score
                        #      if offset2 > offset1
                        #      else adjusted_rand_score)
                        f = (normalized_information_distance
                             if offset2 > offset1 else
                             adjusted_rand_score)
                        rowScores.append(f(bases1, bases2))
                        rowText.append(
                            '%d (%d read%s) vs %d (%d read%s)<br>'
                            '%d read%s in common' %
                            (offset2 + 1, len(readsAtOffset[offset2]),
                             s(len(readsAtOffset[offset2])), offset1 + 1,
                             len(readsAtOffset[offset1]),
                             s(len(readsAtOffset[offset1])),
                             commonCount, s(commonCount)))
                else:
                    rowScores.append(-0.25)
                    rowText.append('%d vs %d, no reads in common' %
                                   (offset1 + 1, offset2 + 1))
        scores.append(rowScores)
        text.append(rowText)

    data = [
        go.Heatmap(z=scores, name='ARI', text=text),
    ]

    layoutDict = dict(
        xaxis={
            'title': 'Significant location index',
        },
        yaxis={
            'title': 'Score',
        },
    )

    if title:
        layoutDict['title'] = title

    layout = go.Layout(layoutDict)
    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=outfile, auto_open=show,
                        show_link=False)
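# --- Illustrative sketch (an assumption, not the repository's code) ---
# normalized_information_distance, used above, is defined elsewhere. A
# common definition is one minus the max-normalized mutual information
# of the two label lists. A minimal version via scikit-learn (assuming
# that dependency, which the adjusted_rand_score import suggests) would
# be:

from sklearn.metrics import normalized_mutual_info_score


def _normalizedInformationDistanceSketch(labels1, labels2):
    """Return a [0.0, 1.0] distance between two clusterings, with 0.0
    meaning the label structures are identical."""
    return 1.0 - normalized_mutual_info_score(labels1, labels2,
                                              average_method='max')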
def plotConsistentComponents(referenceId, genomeLength, components,
                             significantOffsets, outfile, infoFile,
                             outputDir, title='xxx', show=False,
                             titleFontSize=12, axisFontSize=12):
    """
    Plot consistent connected components.
    """
    def offsetsToLocationsStr(offsets):
        return ', '.join(map(lambda i: str(i + 1), sorted(offsets)))

    data = []

    with open(infoFile, 'w') as fp:
        print('There are %d significant location%s: %s' %
              (len(significantOffsets), s(len(significantOffsets)),
               offsetsToLocationsStr(significantOffsets)), file=fp)

        for count, component in enumerate(components, start=1):
            print('Processing component %d, with %d consistent '
                  'component%s' %
                  (count, len(component.consistentComponents),
                   s(len(component.consistentComponents))), file=fp)

            # Get the reference sequence for the component.
            reads = list(
                FastaReads(
                    join(outputDir,
                         'component-%d-consensuses.fasta' % count)))
            reference = reads[0]
            length = len(reference)
            minOffset = min(component.offsets)
            maxOffset = max(component.offsets)

            print('  Offset range: %d to %d' %
                  (minOffset + 1, maxOffset + 1), file=fp)

            # Add a top line to represent the reference.
            data.append(
                go.Scatter(x=(minOffset + 1, maxOffset + 1),
                           y=(1.05, 1.05),
                           hoverinfo='text',
                           name=('Reference component %s' % count),
                           text=('Reference component %s, %d offsets' %
                                 (count, len(component.offsets)))))

            # Add vertical lines at the start and end of this component.
            data.append(
                go.Scatter(
                    x=(minOffset + 1, minOffset + 1),
                    y=(-0.05, 1.05),
                    mode='lines',
                    hoverinfo='none',
                    line={
                        'color': '#eee',
                    },
                    showlegend=False,
                ))
            data.append(
                go.Scatter(
                    x=(maxOffset + 1, maxOffset + 1),
                    y=(-0.05, 1.05),
                    mode='lines',
                    hoverinfo='none',
                    line={
                        'color': '#eee',
                    },
                    showlegend=False,
                ))

            for ccCount, cc in enumerate(component.consistentComponents,
                                         start=1):
                ccSummary = (
                    'Component read count %d, offsets covered %d/%d' %
                    (len(cc.reads), len(cc.nucleotides),
                     len(component.offsets)))

                # Get the consistent connected component consensus.
                consensus = reads[ccCount]
                assert ('consistent-component-%d' %
                        ccCount) in consensus.id

                print('  Processing consistent component', ccCount,
                      file=fp)
                print('  Component sequence:', consensus.sequence,
                      file=fp)
                print('  %d offset%s: %s' %
                      (len(cc.nucleotides), s(len(cc.nucleotides)),
                       offsetsToLocationsStr(cc.nucleotides)), file=fp)

                match = compareDNAReads(reference, consensus)
                print(matchToString(match, reference, consensus,
                                    indent='    '), file=fp)

                identicalMatchCount = match['match'][
                    'identicalMatchCount']
                ambiguousMatchCount = match['match'][
                    'ambiguousMatchCount']

                # The match fraction will ignore gaps in the consensus
                # sequence as it is padded with '-' chars to align it
                # to the reference.
                fraction = (identicalMatchCount +
                            ambiguousMatchCount) / (
                                length -
                                len(match['read2']['gapOffsets']))

                x = []
                y = [fraction] * len(cc.nucleotides)
                text = []
                identical = []
                for index, offset in enumerate(
                        sorted(component.offsets)):
                    if offset in cc.nucleotides:
                        consensusBase = consensus.sequence[index]
                        referenceBase = reference.sequence[index]

                        if consensusBase == referenceBase:
                            identical.append(len(x))

                        # x axis values are 1-based (locations, not
                        # offsets).
                        x.append(offset + 1)

                        text.append(
                            'Location: %d, component: %s, reference: %s'
                            '<br>Component nucleotides: %s<br>%s' %
                            (offset + 1, consensusBase, referenceBase,
                             baseCountsToStr(cc.nucleotides[offset]),
                             ccSummary))

                data.append(
                    go.Scatter(x=x,
                               y=y,
                               hoverinfo='text',
                               selectedpoints=identical,
                               showlegend=False,
                               text=text,
                               mode='markers',
                               selected={'marker': {
                                   'color': 'blue',
                               }},
                               unselected={'marker': {
                                   'color': 'red',
                               }}))

    # Add the significant offsets.
    n = len(significantOffsets)
    data.append(
        go.Scatter(x=[i + 1 for i in significantOffsets],
                   y=[-0.05] * n,
                   text=[
                       'Location %d' % (offset + 1)
                       for offset in significantOffsets
                   ],
                   hoverinfo='text',
                   mode='markers',
                   name='Significant locations'))

    layout = go.Layout(
        title=title,
        titlefont={
            'size': titleFontSize,
        },
        xaxis={
            'range': (0, genomeLength + 1),
            'title': 'Genome location',
            'titlefont': {
                'size': axisFontSize,
            },
        },
        yaxis={
            'range': (-0.1, 1.1),
            'title': 'Nucleotide identity with reference sequence',
            'titlefont': {
                'size': axisFontSize,
            },
        },
        hovermode='closest',
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=outfile, auto_open=show,
                        show_link=False)
def saveClosestReferenceConsensus(self, referenceId, components,
                                  baseCountAtOffset, genomeLength,
                                  alignedReads, referenceInsertions,
                                  outputDir):
    """
    Calculate and save the consensus that best matches a reference
    genome.

    @param referenceId: The C{str} id of the reference sequence.
    @param components: A C{list} of C{ComponentByOffsets} instances.
    @param baseCountAtOffset: A C{list} of C{Counter} instances giving
        the count of each nucleotide at each genome offset.
    @param genomeLength: The C{int} length of the genome the reads were
        aligned to.
    @param alignedReads: A list of C{AlignedRead} instances.
    @param referenceInsertions: A C{dict} keyed by read id (the read
        that would cause a reference insertion). The values are lists
        of 2-tuples, with each 2-tuple containing an offset into the
        reference sequence and the C{str} nucleotides that would be
        inserted starting at that offset.
    @param outputDir: A C{str} directory path.
    @return: A tuple of (consensus, unwantedReads, wantedCcReadCount,
        wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset).
    """

    def ccMatchCount(cc, reference, drawFp, drawMessage):
        """
        Count the matches between a consistent component and a reference
        genome.

        @param cc: A C{ConsistentComponent} instance.
        @param reference: A C{Read} instance.
        @param drawFp: A file pointer to write information about draws
            (if any) to.
        @param drawMessage: A C{str} message to write to C{drawFp}. If
            the string contains '%(baseCounts)s', that will be replaced
            by a string representation of the base counts obtained from
            C{baseCountsToStr}. If not, the base count info will be
            printed after the message.
        @return: The C{int} count of bases that match the reference for
            the offsets covered by the consistent component.
        """
        referenceSequence = reference.sequence
        nucleotides = cc.nucleotides
        count = 0
        for offset in nucleotides:
            message = (drawMessage +
                       (' location %d: base counts' % (offset + 1)) +
                       ' %(baseCounts)s.')
            referenceBase = referenceSequence[offset]
            componentBase = commonest(nucleotides[offset],
                                      referenceBase, drawFp=drawFp,
                                      drawMessage=message)
            count += int(componentBase == referenceBase)
        return count

    def sortedConsistentComponent(component, reference, fp):
        """
        Sort the consistent components in the given
        C{ComponentByOffsets} instance according to how well they match
        the passed reference. The sort order is by increasing match
        score, so the best consistent component is last.

        @param component: A C{ComponentByOffsets} instance.
        @param reference: A C{Read} instance.
        @param fp: A file pointer to write information to.
        @return: A C{list} of (score, offsetCount, index, cc) tuples,
            sorted so that the best-matching consistent component is
            last.
        """
        result = []
        for index, cc in enumerate(component.consistentComponents):
            matchCount = ccMatchCount(
                cc, reference, fp,
                '    Consistent component %d base draw' % (index + 1))
            score = matchCount / len(cc.nucleotides)
            print('  Consistent component %d (%d reads) has %d exact '
                  'matches with the reference, out of the %d offsets '
                  'it covers (%.2f%%).' %
                  (index + 1, len(cc.reads), matchCount,
                   len(cc.nucleotides), score * 100.0), file=fp)
            result.append((score, len(cc.nucleotides), index, cc))

        result.sort()
        return result

    reference = self.referenceGenomes[referenceId]
    fields = reference.id.split(maxsplit=1)
    if len(fields) == 1:
        referenceIdRest = ''
    else:
        referenceIdRest = ' ' + fields[1]

    infoFile = join(outputDir, 'reference-consensus.txt')
    self.report('  Saving closest consensus to reference info to',
                infoFile)

    with open(infoFile, 'w') as infoFp:
        print('Building consensus at significant offsets.', file=infoFp)
        consensus = [None] * genomeLength
        offsetsDone = set()
        wantedReads = set()
        unwantedReads = set()
        for count, component in enumerate(components, start=1):
            print('\nExamining component %d with %d locations: %s' %
                  (count, len(component.offsets),
                   commas(map(lambda offset: offset + 1,
                              component.offsets))), file=infoFp)
            componentOffsets = set(component.offsets)
            sortedCcs = sortedConsistentComponent(component, reference,
                                                  infoFp)

            while componentOffsets - offsetsDone:
                # The following pop call will raise an IndexError if
                # the sorted cc list is empty. But if it's empty we
                # shouldn't be here, because the set of included
                # offsets should at that point include everything in
                # this component. Having the naked pop here ensures we
                # get an exception if this assumption is incorrect.
                # It's like having an assert to test that we found all
                # the component's offsets following the loop.
                score, _, ccIndex, cc = sortedCcs.pop()

                print('  Incorporating nucleotides from consistent '
                      'component %d (%d reads, score %.2f, covering %d '
                      'locations (%d still undecided in consensus)) to '
                      'consensus.' %
                      (ccIndex + 1, len(cc.reads), score,
                       len(cc.nucleotides),
                       len(set(cc.nucleotides) - offsetsDone)),
                      file=infoFp)

                wantedReads |= cc.reads
                for offset in sorted(cc.nucleotides):
                    if offset in offsetsDone:
                        continue
                    nucleotides = cc.nucleotides[offset]
                    referenceBase = reference.sequence[offset]
                    base = commonest(
                        nucleotides, referenceBase, drawFp=infoFp,
                        drawMessage=(
                            '      WARNING: base count draw at '
                            'location %d ' % (offset + 1)) +
                        ' %(baseCounts)s.')
                    assert consensus[offset] is None
                    consensus[offset] = base
                    offsetsDone.add(offset)

                    # Do some reporting on the base just added.
                    if base == referenceBase:
                        mismatch = ''
                    else:
                        consensusBase = commonest(
                            baseCountAtOffset[offset], referenceBase,
                            drawFp=infoFp,
                            drawMessage=(
                                '      WARNING: consensus base count '
                                'draw at location %d ' % (offset + 1)) +
                            ' %(baseCounts)s.')
                        mismatch = (
                            ' (mismatch: reference has %s, all-read '
                            'consensus has %s)' %
                            (referenceBase, consensusBase))

                    print('    Location %d: %s from nucleotides %s%s' %
                          (offset + 1, base,
                           nucleotides.baseCountsToStr(), mismatch),
                          file=infoFp)

            # Print info about the cccs that were not needed to cover
            # all the offsets in this component. Reverse the list so we
            # print them in decreasing match score order.
            for score, _, ccIndex, cc in reversed(sortedCcs):
                unwantedReads |= cc.reads
                print('  Will NOT incorporate nucleotides from '
                      'consistent component %d (%d reads, score %.2f, '
                      'covering %d locations) to consensus.' %
                      (ccIndex + 1, len(cc.reads), score,
                       len(cc.nucleotides)), file=infoFp)

        # Get the base counts at each offset, from the full set of
        # aligned reads minus the reads in cccs we're not using.
        (wantedReadsCountAtOffset,
         wantedReadsBaseCountAtOffset, _) = gatherData(
             genomeLength, set(alignedReads) - unwantedReads)

        # Process the insignificant offsets, based on all reads EXCEPT
        # those not used in the connected components.
        offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone)
        print('\nAttempting to add bases from %d non-significant '
              'consensus locations, using all reads, EXCEPT those '
              'belonging to unused consistent components:' %
              len(offsetsToTry), file=infoFp)
        for offset in offsetsToTry:
            assert consensus[offset] is None
            baseCount = wantedReadsBaseCountAtOffset[offset]
            if baseCount:
                referenceBase = reference.sequence[offset]
                base = commonest(
                    baseCount, referenceBase, drawFp=infoFp,
                    drawMessage=(
                        '    WARNING: consensus base count draw at '
                        'location %d' % (offset + 1)) +
                    ' %(baseCounts)s.')
                print('  Location %d: %s from nucleotides %s' %
                      (offset + 1, base, baseCountsToStr(baseCount)),
                      file=infoFp, end='')

                if base == referenceBase:
                    print(file=infoFp)
                else:
                    print(' (mismatch: reference has %s)' %
                          referenceBase, file=infoFp)
                consensus[offset] = base
                offsetsDone.add(offset)

        # Process remaining insignificant offsets, using ALL reads
        # (i.e., including those in cccs that we wanted to avoid
        # using). At this point, this is the best we can do with these
        # final offsets (otherwise we will get gaps, which in some
        # cases may actually be preferable because the reference
        # sequence may not be fully covered by the actual infection
        # sequence).
        offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone)
        print('\nAttempting to add bases from %d non-significant '
              'consensus locations, using all reads, INCLUDING those '
              'belonging to unused consistent components:' %
              len(offsetsToTry), file=infoFp)
        for offset in offsetsToTry:
            assert consensus[offset] is None
            referenceBase = reference.sequence[offset]
            baseCount = baseCountAtOffset[offset]
            if baseCount:
                base = commonest(
                    baseCount, referenceBase, drawFp=infoFp,
                    drawMessage=(
                        '    WARNING: consensus base count draw at '
                        'location %d' % (offset + 1)) +
                    ' %(baseCounts)s.')
                print('  Location %d: %s from nucleotides %s' %
                      (offset + 1, base, baseCountsToStr(baseCount)),
                      file=infoFp, end='')
            else:
                # The reads did not cover this offset.
                base = '-'
                print('  Location %d: -' % (offset + 1), file=infoFp,
                      end='')

            if base == referenceBase:
                print(file=infoFp)
            else:
                print(' (mismatch: reference has %s)' % referenceBase,
                      file=infoFp)
            consensus[offset] = base
            offsetsDone.add(offset)

        # Sanity check: make sure we processed all offsets.
        assert offsetsDone == set(range(genomeLength))

        consensusId = (
            '%s-consensus%s' %
            (self.shortReferenceId[referenceId], referenceIdRest))

        consensus = Read(consensusId, ''.join(consensus))

        # Print details of the match of the consensus to the reference.
        match = compareDNAReads(reference, consensus)
        print('\nOVERALL match with reference:', file=infoFp)
        print(matchToString(match, reference, consensus, indent='  '),
              file=infoFp)

        # Print any insertions to the reference.
        wantedReadsWithInsertions = (
            set(referenceInsertions) &
            (set(alignedReads) - unwantedReads))
        if wantedReadsWithInsertions:
            print('\nReference insertions present in %d read%s:' %
                  (len(wantedReadsWithInsertions),
                   s(len(wantedReadsWithInsertions))), file=infoFp)
            nucleotides = defaultdict(Counter)
            for readId in wantedReadsWithInsertions:
                for (offset, sequence) in referenceInsertions[readId]:
                    for index, base in enumerate(sequence):
                        nucleotides[offset + index][base] += 1
            print(nucleotidesToStr(nucleotides, prefix='  '),
                  file=infoFp)
        else:
            print('\nReference insertions: none.', file=infoFp)

    filename = join(outputDir, 'reference-consensus.fasta')
    self.report('  Saving consensus to', filename)
    Reads([consensus]).save(filename)

    wantedCcReadCount = 0
    filename = join(outputDir, 'cc-wanted.fastq')
    with open(filename, 'w') as fp:
        for wantedCcRead in wantedReads:
            alignment = wantedCcRead.alignment
            if not (alignment.is_secondary or
                    alignment.is_supplementary):
                wantedCcReadCount += 1
                print(Read(alignment.query_name,
                           alignment.query_sequence,
                           alignmentQuality(alignment)).toString(
                               'fastq'),
                      end='', file=fp)
    self.report(
        '  Saved %d read%s wanted in consistent connected components '
        'to %s' % (wantedCcReadCount, s(wantedCcReadCount), filename))

    unwantedReads = set(alignedReads) - wantedReads

    return (consensus, unwantedReads, wantedCcReadCount,
            wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset)
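# --- Illustrative sketch (an assumption, not the repository's code) ---
# The commonest() helper used repeatedly above is defined elsewhere.
# From its call sites, it evidently returns the most frequent base in a
# counts mapping, preferring the reference base to break draws and
# logging draws to drawFp. Assuming a Counter-like counts argument, a
# minimal version might look like:

def _commonestSketch(counts, referenceBase, drawFp=None, drawMessage=''):
    # Find the maximum count and the set of bases having it.
    maxCount = max(counts.values())
    drawn = {base for base, count in counts.items()
             if count == maxCount}
    if len(drawn) > 1 and drawFp is not None:
        # Report the draw, substituting the base counts if the message
        # asks for them (mirroring the '%(baseCounts)s' convention used
        # in the drawMessage arguments above).
        message = drawMessage or 'Base count draw: %(baseCounts)s'
        if '%(baseCounts)s' in message:
            message = message % {'baseCounts': baseCountsToStr(counts)}
        print(message, file=drawFp)
    # Prefer the reference base when it is among the drawn bases.
    return referenceBase if referenceBase in drawn else drawn.pop()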
def mergeDescription(self, a, b, distance):
    """
    Make a textual description of a cluster merge.

    @param a: An C{int} cluster number.
    @param b: An C{int} cluster number.
    @param distance: The C{float} [0.0, 1.0] distance between the
        clusters.
    @return: A C{str} side-by-side description of clusters C{a} and
        C{b}.
    """
    cluster1 = self.readClusters[a]
    cluster2 = self.readClusters[b]

    result1 = []
    result2 = []
    matches = []
    sharedCount = matchCount = 0
    allOffsets = sorted(
        set(cluster1.nucleotides) | set(cluster2.nucleotides))

    for offset in allOffsets:
        inCount = 0
        if offset in cluster1.nucleotides:
            result1.append(
                cluster1.nucleotides[offset].baseCountsToStr())
            inCount += 1
        else:
            result1.append('-')

        if offset in cluster2.nucleotides:
            result2.append(
                cluster2.nucleotides[offset].baseCountsToStr())
            inCount += 1
        else:
            result2.append('-')

        if inCount == 2:
            sharedCount += 1
            if (cluster1.nucleotides[offset].commonest &
                    cluster2.nucleotides[offset].commonest):
                matches.append('*')
                matchCount += 1
            else:
                multiple = OffsetBases.highestFrequenciesMultiple(
                    cluster1.nucleotides[offset],
                    cluster2.nucleotides[offset])
                # Sanity check: the multiple cannot be None because
                # that would mean only one nucleotide is present, and
                # that case is dealt with by the first part of this
                # if/then.
                assert multiple is not None
                if multiple >= ReadCluster.MIN_COMMONEST_MULTIPLE:
                    matchCount += 1
                    matches.append('+')
                else:
                    matches.append('')
        else:
            matches.append('')

    result1Width = max(len(line) for line in result1)
    result2Width = max(len(line) for line in result2)

    return '\n'.join([
        ('Merging clusters %d and %d with distance %.2f' %
         (a, b, distance)),
        ('Cluster %d has %d read%s, covering %d offset%s' %
         (a, len(cluster1.reads), s(len(cluster1.reads)),
          len(cluster1.nucleotides), s(len(cluster1.nucleotides)))),
        ('Cluster %d has %d read%s, covering %d offset%s' %
         (b, len(cluster2.reads), s(len(cluster2.reads)),
          len(cluster2.nucleotides), s(len(cluster2.nucleotides)))),
        ('%d matches out of %d shared offsets' %
         (matchCount, sharedCount)),
    ] + [
        '  %d: %*s %*s %s' %
        (offset + 1, result1Width, line1, result2Width, line2, match)
        for (offset, line1, line2, match) in
        zip(allOffsets, result1, result2, matches)
    ])
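# A hypothetical example of the side-by-side description returned
# above (cluster numbers, counts, and base-count strings are invented;
# the exact base-count format comes from OffsetBases.baseCountsToStr):
#
#   Merging clusters 2 and 5 with distance 0.15
#   Cluster 2 has 12 reads, covering 3 offsets
#   Cluster 5 has 8 reads, covering 3 offsets
#   2 matches out of 3 shared offsets
#     101: A:10 C:2 A:7 G:1 *
#     205:     G:12     G:8 *
#     390:  T:9 C:3 C:7 T:1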
def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
    """
    Analyze the given reference id in the given alignment file (if an
    alignment to the reference id is present).

    @param referenceId: The C{str} id of the reference sequence to
        analyze.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    @return: C{None} if C{referenceId} is not present in
        C{alignmentFile} or if no significant offsets are found. Else,
        a C{dict} containing the significant offsets and the consensus
        sequence that best matches C{referenceId}.
    """
    analysis = self.initialReferenceIdAnalysis(referenceId,
                                               alignmentFile, outputDir)

    if analysis:
        (genomeLength, alignedReads, readCountAtOffset,
         baseCountAtOffset, readsAtOffset, significantOffsets,
         samFilter, paddedSAM) = analysis
    else:
        return

    insignificantOffsets = set(
        range(genomeLength)) - set(significantOffsets)

    reference = self.referenceGenomes[referenceId]
    referenceSequence = reference.sequence

    consensus = []
    for base in referenceSequence:
        ob = OffsetBases()
        ob.incorporateBase(base)
        consensus.append(ob)

    readQueue = PriorityQueue()
    self.updatePriorityQueue(readQueue, alignedReads, consensus,
                             significantOffsets)

    consensusFilename = join(outputDir, 'reference-consensus.sam')
    nonConsensusFilename = join(outputDir,
                                'reference-non-consensus.sam')
    self.report('  Writing consensus SAM to', consensusFilename)
    self.report('  Writing non-consensus SAM to', nonConsensusFilename)

    with samfile(alignmentFile) as sam:
        consensusAlignment = AlignmentFile(consensusFilename, mode='w',
                                           template=sam)
        nonConsensusAlignment = AlignmentFile(nonConsensusFilename,
                                              mode='w', template=sam)

    # Reads with no significant offsets get written to both output
    # files.
    readsWithNoSignificantOffsetsCount = 0
    for read in alignedReads:
        if not read.significantOffsets:
            readsWithNoSignificantOffsetsCount += 1
            consensusAlignment.write(read.alignment)
            nonConsensusAlignment.write(read.alignment)

            for offset in insignificantOffsets:
                base = read.base(offset)
                if base is not None:
                    consensus[offset].incorporateBase(base)

    self.report('  %d read%s did not overlap any significant offsets' %
                (readsWithNoSignificantOffsetsCount,
                 s(readsWithNoSignificantOffsetsCount)))

    readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
    cutoff = self.cutoff
    while readQueue:
        mismatchFraction, _ = readQueue.lowestPriority()
        read = readQueue.pop()
        if mismatchFraction <= cutoff:
            # We want this read. Incorporate it into the consensus.
            readsMatchingConsensusCount += 1
            consensusAlignment.write(read.alignment)
            affectedReads = set()
            for offset in read.significantOffsets:
                readBase = read.base(offset)
                consensus[offset].incorporateBase(readBase)
                for readAtOffset in readsAtOffset[offset]:
                    if readAtOffset in readQueue:
                        affectedReads.add(readAtOffset)
            self.updatePriorityQueue(readQueue, affectedReads,
                                     consensus, significantOffsets)
        else:
            readsNotMatchingConsensusCount += 1
            nonConsensusAlignment.write(read.alignment)

    consensusAlignment.close()
    nonConsensusAlignment.close()

    self.report(
        '  %d read%s matched the consensus, %d did not.' %
        (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
         readsNotMatchingConsensusCount))

    # Remove the reference bases from the consensus.
    for offset, base in enumerate(referenceSequence):
        consensus[offset].unincorporateBase(base)

    consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
    self.report('  Writing consensus info to', consensusInfoFilename)

    with open(consensusInfoFilename, 'w') as fp:
        consensusSequence = []
        for offset in range(genomeLength):
            # Take a copy of the commonest set because we may pop from
            # it below.
            commonest = set(consensus[offset].commonest)
            referenceBase = referenceSequence[offset]

            if len(commonest) > 1:
                nucleotides = ' Nucleotides: %s' % (
                    consensus[offset].baseCountsToStr())
            else:
                nucleotides = ''

            if referenceBase in commonest:
                consensusBase = referenceBase
            else:
                if len(commonest) == 0:
                    # Nothing in the included reads covers this offset.
                    consensusBase = '-'
                else:
                    # One or more bases are commonest. If more than
                    # one, this is a draw that the reference base
                    # (which is not among the commonest) cannot break,
                    # so take an arbitrary one of the drawn bases.
                    consensusBase = commonest.pop()

            consensusSequence.append(consensusBase)

            mismatch = '' if referenceBase == consensusBase else (
                ' Mismatch (reference has %s)' % referenceBase)

            print('%d: %s%s%s' %
                  (offset + 1, consensusBase, mismatch, nucleotides),
                  file=fp)

    consensusRead = Read('greedy-consensus-%s' % referenceId,
                         ''.join(consensusSequence))
    consensusFilename = join(outputDir, 'reference-consensus.fasta')
    self.report('  Writing greedy consensus to', consensusFilename)
    Reads([consensusRead]).save(consensusFilename)

    return {
        'consensusRead': consensusRead,
        'significantOffsets': significantOffsets,
    }
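# --- Illustrative sketch (an assumption, not the repository's code) ---
# updatePriorityQueue, used above, is defined elsewhere. The priority
# it assigns is plausibly the fraction of a read's significant offsets
# whose base disagrees with the current consensus's commonest bases,
# so that the best-matching read is always popped first:

def _mismatchFractionSketch(read, consensus):
    significantOffsets = read.significantOffsets
    if not significantOffsets:
        # Reads with no significant offsets are handled separately
        # above and never enter the queue.
        return 0.0
    mismatches = sum(
        read.base(offset) not in consensus[offset].commonest
        for offset in significantOffsets)
    return mismatches / len(significantOffsets)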
def mergeDescriptionWithOffsetScores(self, a, b, distance):
    """
    Make a textual description of a cluster merge, including per-offset
    score information.

    @param a: An C{int} cluster number.
    @param b: An C{int} cluster number.
    @param distance: The C{float} [0.0, 1.0] distance between the
        clusters.
    @return: A C{str} side-by-side description of clusters C{a} and
        C{b}.
    """
    cluster1 = self.readClusters[a]
    cluster2 = self.readClusters[b]

    result1 = []
    result2 = []
    offsetScores = []
    matches = []
    sharedCount = matchCount = 0
    allOffsets = sorted(
        set(cluster1.nucleotides) | set(cluster2.nucleotides))

    for offset in allOffsets:
        inCount = 0
        if offset in cluster1.nucleotides:
            result1.append(
                cluster1.nucleotides[offset].baseCountsToStr())
            inCount += 1
        else:
            result1.append('-')

        if offset in cluster2.nucleotides:
            result2.append(
                cluster2.nucleotides[offset].baseCountsToStr())
            inCount += 1
        else:
            result2.append('-')

        if inCount == 2:
            sharedCount += 1
            if (cluster1.nucleotides[offset].commonest &
                    cluster2.nucleotides[offset].commonest):
                matches.append('*')
                matchCount += 1
            else:
                matches.append('')
            offsetScores.append('%.3f' % min(
                OffsetBases.multiplicativeDistance(
                    cluster1.nucleotides[offset],
                    cluster2.nucleotides[offset]),
                OffsetBases.homogeneousDistance(
                    cluster1.nucleotides[offset],
                    cluster2.nucleotides[offset])))
        else:
            matches.append('')
            offsetScores.append('')

    result1Width = max(len(line) for line in result1)
    result2Width = max(len(line) for line in result2)
    offsetScoresWidth = max(len(line) for line in offsetScores)

    return '\n'.join([
        ('Merging clusters %d and %d with distance %.2f' %
         (a, b, distance)),
        ('Cluster %d has %d read%s, covering %d offset%s' %
         (a, len(cluster1.reads), s(len(cluster1.reads)),
          len(cluster1.nucleotides), s(len(cluster1.nucleotides)))),
        ('Cluster %d has %d read%s, covering %d offset%s' %
         (b, len(cluster2.reads), s(len(cluster2.reads)),
          len(cluster2.nucleotides), s(len(cluster2.nucleotides)))),
        ('%d matches out of %d shared offsets' %
         (matchCount, sharedCount)),
    ] + [
        '  %d: %*s %*s %*s %s' %
        (offset + 1, result1Width, line1, result2Width, line2,
         offsetScoresWidth, offsetScore, match)
        for (offset, line1, line2, offsetScore, match) in
        zip(allOffsets, result1, result2, offsetScores, matches)
    ])
parser.add_argument(
    '--alignReads', action='store_true', default=False,
    help=('If specified, print the reads aligned (with "-" characters) '
          'to the genome.'))

addFASTACommandLineOptions(parser)
args = parser.parse_args()
reads = list(parseFASTACommandLineOptions(args))

# There should only be one "read", the sequence we are to create other
# reads from.
assert len(reads) == 1, (
    'FASTA input contained %d sequence%s (expected just one).' %
    (len(reads), s(len(reads))))
genome = reads[0]
genomeLen = len(genome)

meanLength = args.meanLength
if meanLength > genomeLen:
    raise ValueError('The mean read length (%d) is greater than the '
                     'genome length (%d)' % (int(meanLength), genomeLen))
if meanLength <= 0:
    raise ValueError('The mean read length must be greater than zero')

sdLength = args.sdLength
if sdLength <= 0.0:
    raise ValueError('The read length standard deviation must be > 0.0')
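# --- Illustrative sketch (an assumption about code not shown here) ---
# The meanLength / sdLength checks above imply that read lengths are
# later drawn from a normal distribution. A minimal sampler consistent
# with those checks:

from random import normalvariate


def _sampleReadLengthSketch(meanLength, sdLength, genomeLen):
    # Redraw until the sampled length is positive and fits the genome.
    while True:
        length = int(normalvariate(meanLength, sdLength) + 0.5)
        if 0 < length <= genomeLen:
            return length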