Example 1
    def _getReferenceIds(self, alignedReferences, referenceIds):
        """
        Figure out which reference ids we can process.

        @param alignedReferences: A C{set} of C{str} reference ids found in
            the passed reference files.
        @param referenceIds: A C{list} of C{str} reference ids for which
            processing has specifically been requested, or C{None}.
        @return: A C{set} of C{str} reference ids to process.
        """
        if referenceIds:
            # Specific reference ids were given. Check that each appears in
            # some alignment file and that we have a genome for each. Any
            # error here results in a message to stderr and an exit.
            missing = set(referenceIds) - alignedReferences
            if missing:
                print(
                    'Alignments against the following reference id%s are not '
                    'present in any alignment file:\n%s' %
                    (s(len(missing)), '\n'.join('  %s' % id_
                                                for id_ in sorted(missing))),
                    file=sys.stderr)
                sys.exit(1)

            missing = set(referenceIds) - set(self.referenceGenomes)
            if missing:
                print('Reference id%s %s not present in any reference genome '
                      'file.' % (s(len(missing)), commas(missing)),
                      file=sys.stderr)
                sys.exit(1)
        else:
            # We weren't told which specific reference ids to examine, so
            # examine all references that are mentioned in any alignment
            # file and for which we also have a genome. Mention any
            # references from alignment files that we can't process due to
            # the lack of a genome.
            missing = alignedReferences - set(self.referenceGenomes)
            if missing:
                self.report(
                    'No analysis will be performed on reference%s %s '
                    '(found in SAM/BAM alignment file(s) headers) because no '
                    'corresponding reference genome was found.' %
                    (s(len(missing)), commas(missing)))

            referenceIds = alignedReferences & set(self.referenceGenomes)

            if referenceIds:
                self.report('Examining %d reference%s: %s' %
                            (len(referenceIds), s(
                                len(referenceIds)), commas(referenceIds)))
            else:
                print(
                    'Nothing to do! No genome could be found for any aligned '
                    'reference. Found reference%s: %s' %
                    (s(len(alignedReferences)), commas(alignedReferences)),
                    file=sys.stderr)
                sys.exit(1)

        return referenceIds
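All of these examples rely on two small formatting helpers, s and commas. A minimal sketch of their apparent behavior, inferred from the call sites above (the real implementations live in the package's utility module, so treat this as an assumption):

def s(count, suffix='s'):
    # Return a plural suffix: '1 read' vs '2 reads'.
    return '' if count == 1 else suffix


def commas(items):
    # Join items into a sorted, comma-separated string for stable output.
    return ', '.join(sorted(map(str, items)))


print('%d read%s' % (3, s(3)))       # '3 reads'
print(commas({'ref-b', 'ref-a'}))    # 'ref-a, ref-b'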
Example 2
    def _readReferenceGenomes(self, referenceGenomeFiles):
        """
        Read reference genomes from files and check that any duplicates have
        identical sequences.

        @param referenceGenomeFiles: A C{list} of C{str} names of FASTA files
            containing reference genomes.
        @raise ValueError: If a reference genome is found in more than one file
            and the sequences are not identical.
        @return: A C{dict} keyed by C{str} sequence id with C{dark.Read}
            values holding reference genomes.
        """
        result = {}
        seen = {}
        for filename in referenceGenomeFiles:
            for read in FastaReads(filename):
                id_ = read.id
                if id_ in seen:
                    if result[id_].sequence != read.sequence:
                        raise ValueError(
                            'Reference genome id %r was found in two files '
                            '(%r and %r) but with different sequences.' %
                            (id_, seen[id_], filename))
                else:
                    seen[id_] = filename
                    result[id_] = read

        self.report('Read %d reference genome%s:\n%s' %
                    (len(result), s(len(result)), '\n'.join(
                        '  %s' % id_ for id_ in sorted(result))),
                    requiredVerbosityLevel=2)

        return result
Example 3
    def __str__(self):
        result = [
            'Cluster with %d read%s:' % (len(self.reads), s(len(self.reads)))
        ]
        for read in self.reads:
            result.append('  %s' % read)
        result.append(nucleotidesToStr(self.nucleotides, prefix='  '))
        return '\n'.join(result)
Example 4
    def _removePreExistingTopLevelOutputDirFiles(self):
        """
        Remove all pre-existing files from the top-level output directory.
        """
        paths = list(
            map(str, chain(Path(self.outputDir).glob('result-summary.txt'))))

        if paths:
            self.report('    Removing %d pre-existing output file%s from '
                        'top-level output directory %s.' %
                        (len(paths), s(len(paths)), self.outputDir),
                        requiredVerbosityLevel=2)
            list(map(unlink, paths))
Example 5
    def summarize(self, fp, count, componentOffsets, referenceSequence):
        """
        Write out a summary of this consistent component.

        @param fp: The file pointer to write to.
        @param count: The C{int} number of this component.
        @param componentOffsets: The C{set} of offsets in this component.
        @param referenceSequence: The C{str} reference sequence.
        """
        plural = s(len(self.reads))
        print('    Component %d: %d read%s, covering %d offset%s' %
              (count, len(self.reads), plural, len(
                  self.nucleotides), s(len(self.nucleotides))),
              file=fp)
        print('    Nucleotide counts for each offset:', file=fp)
        print(nucleotidesToStr(self.nucleotides, '      '), file=fp)
        print('    Consensus sequence: %s' %
              self.consensusSequence(componentOffsets, referenceSequence, fp),
              file=fp)
        print('    Read%s:' % plural, file=fp)
        for read in sorted(self.reads):
            print('     ', read, file=fp)
Example 6
def plotBaseFrequencies(significantOffsets,
                        baseCountAtOffset,
                        readCountAtOffset,
                        outfile,
                        title=None,
                        sampleName=None,
                        valuesFile=None,
                        minReads=5,
                        homogeneousCutoff=0.9,
                        sortOn=None,
                        histogram=False,
                        show=False,
                        titleFontSize=12,
                        axisFontSize=12):
    """
    Plot sorted base frequencies at significant sites.
    """

    subtitle = (
        '<br>%d significant sites. Min %d read%s per site. '
        '%.2f homogeneity cutoff.' %
        (len(significantOffsets), minReads, s(minReads), homogeneousCutoff))

    if sortOn is None:
        title = title or 'Base frequencies (sorted)'
        _plotBaseFrequencies(significantOffsets, baseCountAtOffset,
                             readCountAtOffset, outfile, title + subtitle,
                             show, titleFontSize, axisFontSize)
    elif sortOn == 'max':
        title = title or 'Maximum base frequency'
        result = _plotSortedMaxBaseFrequencies(
            significantOffsets, baseCountAtOffset, readCountAtOffset, outfile,
            title + subtitle, histogram, show, titleFontSize, axisFontSize)
    else:
        assert sortOn == 'entropy', ('Unknown --sortOn value: %r' % sortOn)
        title = title or 'Base frequency entropy'
        result = _plotBaseFrequenciesEntropy(significantOffsets,
                                             baseCountAtOffset,
                                             readCountAtOffset, outfile,
                                             title + subtitle, histogram, show,
                                             titleFontSize, axisFontSize)

    if valuesFile:
        # No result is produced when sortOn is None (above), so fail clearly
        # rather than with a NameError on 'result'.
        if sortOn is None:
            raise ValueError('A values file cannot be written when sortOn '
                             'is None.')
        with open(valuesFile, 'w') as fp:
            dump(
                {
                    'sampleName': sampleName,
                    'text': [text for _, text in result],
                    'values': [value for value, _ in result],
                }, fp)
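When a valuesFile is given, the function dumps a JSON object with 'sampleName', 'text', and 'values' keys. A hypothetical reader for such a file (the 'values.json' name is just an assumption; use whatever path was passed as valuesFile):

import json

with open('values.json') as fp:
    data = json.load(fp)

print('Sample:', data['sampleName'])
for value, text in zip(data['values'], data['text']):
    print(value, text)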
Example 7
def plotSignificantOffsets(fig, row, col, significantOffsets, genomeLength):
    """
    Plot the genome offsets that are significant.
    """
    n = len(significantOffsets)
    trace = go.Scatter(x=[i + 1 for i in significantOffsets],
                       y=[1.0] * n,
                       mode='markers',
                       showlegend=False)
    fig.append_trace(trace, row, col)
    fig['layout']['annotations'][1]['text'] = (
        '%d significant genome location%s' % (n, s(n)))
    fig['layout']['xaxis'].update({
        'range': (0, genomeLength + 1),
    })
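Because this function writes into fig['layout']['annotations'][1] (the second subplot title), it expects a figure built with plotly subplots. A hypothetical call, assuming a plotly version that provides make_subplots and still accepts the (deprecated) append_trace used above:

import plotly
from plotly.subplots import make_subplots

# Two stacked subplots; the annotation at index 1 is the second title.
fig = make_subplots(rows=2, cols=1, subplot_titles=('Coverage', 'Offsets'))
plotSignificantOffsets(fig, 2, 1, significantOffsets={9, 98, 399},
                       genomeLength=1000)
plotly.offline.plot(fig, filename='offsets.html', auto_open=False)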
Example 8
    def _removePreExistingReferenceDirFiles(self, directory):
        """
        Remove all pre-existing files from the output directory for a
        particular reference sequence alignment.

        @param directory: The C{str} directory to examine.
        """
        # This prevents us from doing a run that results in (say) 6
        # component files and then later doing a run that results in
        # only 5 components and erroneously thinking that
        # component-6-2.fasta etc. are from the most recent run.
        paths = list(
            map(
                str,
                chain(
                    Path(directory).glob('*.fasta'),
                    Path(directory).glob('*.html'),
                    Path(directory).glob('*.txt'))))

        if paths:
            self.report('    Removing %d pre-existing output file%s from %s '
                        'directory.' % (len(paths), s(len(paths)), directory),
                        requiredVerbosityLevel=2)
            list(map(unlink, paths))
Example 9
    def initialReferenceIdAnalysis(self, referenceId, alignmentFile,
                                   outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{tuple} of
            (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
            readsAtOffset, significantOffsets, samFilter, paddedSAM).
        """

        # Make sure this reference id is in this alignment file and if so
        # get its length (and check it's the same as the length of the
        # sequence given in the reference file).
        with samfile(alignmentFile) as sam:
            tid = sam.get_tid(referenceId)
            if tid == -1:
                # This referenceId is not in this alignment file.
                self.report('    Reference %s not in alignment file.' %
                            referenceId)
                return
            else:
                genomeLength = sam.lengths[tid]
                # Sanity check.
                assert genomeLength == len(self.referenceGenomes[referenceId])

        if self.plotSAM:
            filename = join(outputDir, 'reads.html')
            self.report('    Saving reads alignment plot to %s' % filename)
            plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                    filename,
                    title=referenceId,
                    jitter=0.45)

        alignedReads = []
        samFilter = SAMFilter(
            alignmentFile,
            referenceIds={referenceId},
            dropDuplicates=True,
            dropSupplementary=True,
            # dropSecondary=True,
            storeQueryIds=True)
        paddedSAM = PaddedSAM(samFilter)
        for query in paddedSAM.queries(addAlignment=True):
            assert len(query) == genomeLength
            alignedReads.append(
                AlignedRead(query.id, query.sequence, query.alignment))

        # Sanity check that all aligned reads have different ids. This
        # should be the case because the padded SAM queries method adds /2,
        # /3 etc to queries that have more than one alignment.
        assert len(alignedReads) == len(set(read.id for read in alignedReads))

        readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
            genomeLength, alignedReads)

        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   self.minReads, self.homogeneousCutoff))

        self.report(
            '    %d alignment%s (of %d unique %s) read from %s' %
            (samFilter.alignmentCount, s(
                samFilter.alignmentCount), len(samFilter.queryIds), 'query'
             if len(samFilter.queryIds) == 1 else 'queries', alignmentFile))
        self.report('    %d of which %s aligned to %s' %
                    (len(alignedReads),
                     'was' if len(alignedReads) == 1 else 'were', referenceId))
        self.report('    Reference genome length %d' % genomeLength)
        self.report('    Found %d significant location%s' %
                    (len(significantOffsets), s(len(significantOffsets))))

        self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

        if not significantOffsets:
            self.report('    No significant locations found.')
            return

        if self.saveReducedFASTA:
            self.saveReducedFasta(significantOffsets, outputDir)

        self._plotCoverageAndSignificantLocations(referenceId, alignmentFile,
                                                  readCountAtOffset,
                                                  genomeLength,
                                                  significantOffsets,
                                                  outputDir)

        self.saveSignificantOffsets(significantOffsets, outputDir)

        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)

        self.saveReferenceBaseFrequencyPlot(referenceId, genomeLength,
                                            significantOffsets,
                                            baseCountAtOffset,
                                            readCountAtOffset, outputDir)

        # Save the reference.
        filename = join(outputDir, 'reference.fasta')
        self.report('    Saving reference to', filename)
        reference = self.referenceGenomes[referenceId]
        Reads([reference]).save(filename)

        # Extract a consensus according to bcftools.
        self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

        return (genomeLength, alignedReads, readCountAtOffset,
                baseCountAtOffset, readsAtOffset, significantOffsets,
                samFilter, paddedSAM)
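The reference check at the top of this method is a standard pysam idiom. In isolation (assuming a local 'alignments.bam'):

import pysam

# get_tid returns -1 when the reference is not in the file's header;
# otherwise it indexes the parallel lengths/references lists.
with pysam.AlignmentFile('alignments.bam') as sam:
    tid = sam.get_tid('my-reference-id')
    if tid == -1:
        print('Reference not present.')
    else:
        print('Reference length:', sam.lengths[tid])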
Example 10
    parser.add_argument(
        '--idSuffix',
        default='',
        help=('Add this string to the end of the read ids. This is added '
              'after the string added by --editIds (if also used).'))

    parser.add_argument(
        '--editIds',
        action='store_true',
        default=False,
        help=('Add "-mutations:N" to the end of each read id, where N '
              'is the number of mutations introduced to the read.'))

    addFASTACommandLineOptions(parser)
    args = parser.parse_args()
    reads = parseFASTACommandLineOptions(args)
    rate = args.rate
    verbose = args.verbose
    editIds = args.editIds
    idSuffix = args.idSuffix
    format_ = 'fastq' if args.fastq else 'fasta'

    for read in reads:
        count = len(mutateRead(read, rate))
        if verbose:
            print('%d mutation%s made in read (len %d) %s' %
                  (count, s(count), len(read), read.id),
                  file=sys.stderr)
        read.id = (read.id + (('-mutations:%d' % count) if editIds else '') +
                   idSuffix)
        print(read.toString(format_), end='')
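Only the length of mutateRead's return value is used above, so it must return the collection of mutated positions. A minimal sketch of a function with that contract, assuming independent per-base mutation with probability rate (hypothetical; the real function is imported by the script):

import random

class Read:
    def __init__(self, id_, sequence):
        self.id = id_
        self.sequence = sequence

def mutateRead(read, rate, bases='ACGT'):
    # Mutate each position with probability rate and return the list of
    # 0-based offsets that were changed.
    sequence = list(read.sequence)
    mutatedOffsets = []
    for offset, base in enumerate(sequence):
        if random.random() < rate:
            sequence[offset] = random.choice(
                [other for other in bases if other != base])
            mutatedOffsets.append(offset)
    read.sequence = ''.join(sequence)
    return mutatedOffsets

read = Read('demo', 'ACGT' * 10)
print('%d mutations made' % len(mutateRead(read, 0.1)))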
Example 11
def plotConsistency(significantOffsets, baseCountAtOffset, readsAtOffset,
                    minCommonReads, outfile, title, show):
    """
    """
    scores = []
    text = []

    for offset1 in significantOffsets:
        rowScores = []
        rowText = []
        for offset2 in significantOffsets:
            if offset2 == offset1:
                rowScores.append(1.0)
                rowText.append('%d identity (%d read%s)' %
                               (offset1 + 1, len(readsAtOffset[offset1]),
                                s(len(readsAtOffset[offset1]))))
            else:
                readsCoveringBoth = (readsAtOffset[offset1]
                                     & readsAtOffset[offset2])

                # print('analyzeConsistency', offset1, offset2)
                # print('%d reads cover 1, %d cover 2, %d cover both' % (
                #     len(readsAtOffset[offset1]),
                #     len(readsAtOffset[offset2]),
                #     len(readsCoveringBoth)))

                if readsCoveringBoth:
                    commonCount = len(readsCoveringBoth)
                    if commonCount < minCommonReads:
                        rowScores.append(0.0)
                        rowText.append(
                            '%d (%d read%s) vs %d (%d read%s)<br>Too few (%d) '
                            'reads in common' %
                            (offset2 + 1, len(readsAtOffset[offset2]),
                             s(len(readsAtOffset[offset2])), offset1 + 1,
                             len(readsAtOffset[offset1]),
                             s(len(readsAtOffset[offset1])), commonCount))
                    else:
                        bases1 = []
                        bases2 = []
                        for read in readsCoveringBoth:
                            bases1.append(read.base(offset1))
                            bases2.append(read.base(offset2))

                        # f = (adjusted_mutual_info_score if offset2 > offset1
                        #      else adjusted_rand_score)
                        f = (normalized_information_distance
                             if offset2 > offset1 else adjusted_rand_score)

                        rowScores.append(f(bases1, bases2))
                        rowText.append(
                            '%d (%d read%s) vs %d (%d read%s)<br>%d read%s in '
                            'common' %
                            (offset2 + 1, len(readsAtOffset[offset2]),
                             s(len(readsAtOffset[offset2])), offset1 + 1,
                             len(readsAtOffset[offset1]),
                             s(len(readsAtOffset[offset1])), commonCount,
                             s(commonCount)))
                        # print('offset %d vs %d = %.4f (%d)' %
                        # (offset1, offset2, rowScores[-1], commonCount))
                else:
                    rowScores.append(-0.25)
                    rowText.append('%d vs %d, no reads in common' %
                                   (offset1 + 1, offset2 + 1))

        scores.append(rowScores)
        text.append(rowText)

    data = [
        # go.Heatmap(x=x, y=x, z=scores, name='ARI'),
        go.Heatmap(z=scores, name='ARI', text=text)  # , colorscale='Viridis'),
    ]

    layoutDict = dict(
        xaxis={
            'title': 'Significant location index',
        },
        yaxis={
            'title': 'Significant location index',
        },
    )

    if title:
        layoutDict['title'] = title

    layout = go.Layout(layoutDict)

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=outfile, auto_open=show, show_link=False)
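The scores treat the bases carried by the shared reads at two offsets as two labelings of those reads. sklearn's adjusted_rand_score (used above for the lower triangle) is symmetric and label-invariant: perfectly co-varying bases score 1.0 regardless of which bases they are.

from sklearn.metrics import adjusted_rand_score

# The labels themselves don't matter, only how the reads group together.
print(adjusted_rand_score(['A', 'A', 'C', 'C'], ['G', 'G', 'T', 'T']))  # 1.0
print(adjusted_rand_score(['A', 'A', 'C', 'C'], ['G', 'T', 'G', 'T']))  # -0.5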
Example 12
def plotConsistentComponents(referenceId,
                             genomeLength,
                             components,
                             significantOffsets,
                             outfile,
                             infoFile,
                             outputDir,
                             title='xxx',
                             show=False,
                             titleFontSize=12,
                             axisFontSize=12):
    """
    Plot consistent connected components.
    """
    def offsetsToLocationsStr(offsets):
        return ', '.join(map(lambda i: str(i + 1), sorted(offsets)))

    data = []

    with open(infoFile, 'w') as fp:

        print('There are %d significant location%s: %s' %
              (len(significantOffsets), s(len(significantOffsets)),
               offsetsToLocationsStr(significantOffsets)),
              file=fp)

        for count, component in enumerate(components, start=1):

            print('Processing component %d, with %d consistent component%s' %
                  (count, len(component.consistentComponents),
                   s(len(component.consistentComponents))),
                  file=fp)

            # Get the reference sequence for the component.
            reads = list(
                FastaReads(
                    join(outputDir, 'component-%d-consensuses.fasta' % count)))

            reference = reads[0]
            length = len(reference)
            minOffset = min(component.offsets)
            maxOffset = max(component.offsets)

            print('  Offset range: %d to %d' % (minOffset + 1, maxOffset + 1),
                  file=fp)

            # Add a top line to represent the reference.
            data.append(
                go.Scatter(x=(minOffset + 1, maxOffset + 1),
                           y=(1.05, 1.05),
                           hoverinfo='text',
                           name=('Reference component %s' % count),
                           text=('Reference component %s, %d offsets' %
                                 (count, len(component.offsets)))))

            # Add vertical lines at the start and end of this component.
            data.append(
                go.Scatter(
                    x=(minOffset + 1, minOffset + 1),
                    y=(-0.05, 1.05),
                    mode='lines',
                    hoverinfo='none',
                    line={
                        'color': '#eee',
                    },
                    showlegend=False,
                ))
            data.append(
                go.Scatter(
                    x=(maxOffset + 1, maxOffset + 1),
                    y=(-0.05, 1.05),
                    mode='lines',
                    hoverinfo='none',
                    line={
                        'color': '#eee',
                    },
                    showlegend=False,
                ))

            for ccCount, cc in enumerate(component.consistentComponents,
                                         start=1):

                ccSummary = ('Component read count %d, offsets covered %d/%d' %
                             (len(cc.reads), len(
                                 cc.nucleotides), len(component.offsets)))

                # Get the consistent connected component consensus.
                consensus = reads[ccCount]
                assert ('consistent-component-%d' % ccCount) in consensus.id

                print('  Processing consistent component', ccCount, file=fp)
                print('  Component sequence:', consensus.sequence, file=fp)
                print('  %d offset%s: %s' %
                      (len(cc.nucleotides), s(len(cc.nucleotides)),
                       offsetsToLocationsStr(cc.nucleotides)),
                      file=fp)

                match = compareDNAReads(reference, consensus)
                print(matchToString(match, reference, consensus,
                                    indent='    '),
                      file=fp)

                identicalMatchCount = match['match']['identicalMatchCount']
                ambiguousMatchCount = match['match']['ambiguousMatchCount']

                # The match fraction will ignore gaps in the consensus
                # sequence as it is padded with '-' chars to align it to
                # the reference.
                fraction = (identicalMatchCount + ambiguousMatchCount) / (
                    length - len(match['read2']['gapOffsets']))

                x = []
                y = [fraction] * len(cc.nucleotides)
                text = []
                identical = []
                for index, offset in enumerate(sorted(component.offsets)):
                    if offset in cc.nucleotides:

                        consensusBase = consensus.sequence[index]
                        referenceBase = reference.sequence[index]

                        if consensusBase == referenceBase:
                            identical.append(len(x))

                        # x axis values are 1-based (locations, not offsets)
                        x.append(offset + 1)

                        text.append(
                            'Location: %d, component: %s, reference: %s'
                            '<br>Component nucleotides: %s<br>%s' %
                            (offset + 1, consensusBase, referenceBase,
                             baseCountsToStr(
                                 cc.nucleotides[offset]), ccSummary))

                data.append(
                    go.Scatter(x=x,
                               y=y,
                               hoverinfo='text',
                               selectedpoints=identical,
                               showlegend=False,
                               text=text,
                               mode='markers',
                               selected={'marker': {
                                   'color': 'blue',
                               }},
                               unselected={'marker': {
                                   'color': 'red',
                               }}))

    # Add the significant offsets.
    n = len(significantOffsets)
    data.append(
        go.Scatter(x=[i + 1 for i in significantOffsets],
                   y=[-0.05] * n,
                   text=[
                       'Location %d' % (offset + 1)
                       for offset in significantOffsets
                   ],
                   hoverinfo='text',
                   mode='markers',
                   name='Significant locations'))

    layout = go.Layout(
        title=title,
        titlefont={
            'size': titleFontSize,
        },
        xaxis={
            'range': (0, genomeLength + 1),
            'title': 'Genome location',
            'titlefont': {
                'size': axisFontSize,
            },
        },
        yaxis={
            'range': (-0.1, 1.1),
            'title': 'Nucleotide identity with reference sequence',
            'titlefont': {
                'size': axisFontSize,
            },
        },
        hovermode='closest',
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.plot(fig, filename=outfile, auto_open=show, show_link=False)
Example 13
    def saveClosestReferenceConsensus(self, referenceId, components,
                                      baseCountAtOffset, genomeLength,
                                      alignedReads, referenceInsertions,
                                      outputDir):
        """
        Calculate and save the best consensus to a reference genome.

        @param referenceId: The C{str} id of the reference sequence.
        @param components: A C{list} of C{ComponentByOffsets} instances.
        @param baseCountAtOffset: A C{list} of C{Counter} instances giving
            the count of each nucleotide at each genome offset.
        @param genomeLength: The C{int} length of the genome the reads were
            aligned to.
        @param alignedReads: A list of C{AlignedRead} instances.
        @param referenceInsertions: A C{dict} keyed by read id (the read
            that would cause a reference insertion). The values are lists
            of 2-tuples, with each 2-tuple containing an offset into the
            reference sequence and the C{str} of nucleotide that would be
            inserted starting at that offset.
        @param outputDir: A C{str} directory path.
        @return: A tuple of (consensus, unwantedReads, wantedCcReadCount,
                 wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset).
        """
        def ccMatchCount(cc, reference, drawFp, drawMessage):
            """
            Count the matches between a consistent component and a reference
            genome.

            @param cc: A C{ConsistentComponent} instance.
            @param reference: A C{Read} instance.
            @param drawFp: A file pointer to write information about draws (if
                any) to.
            @param drawMessage: A C{str} message to write to C{drawFp}. If the
                string contains '%(baseCounts)s', that will be replaced by a
                string representation of the base counts (obtained from
                C{baseCountsToStr}). If not, the base count info will be
                printed after the message.
            @return: The C{int} count of bases that match the reference
                for the offsets covered by the consistent component.
            """
            referenceSequence = reference.sequence
            nucleotides = cc.nucleotides
            count = 0
            for offset in nucleotides:
                message = (drawMessage + (' location %d: base counts' %
                                          (offset + 1)) + ' %(baseCounts)s.')
                referenceBase = referenceSequence[offset]
                componentBase = commonest(nucleotides[offset],
                                          referenceBase,
                                          drawFp=drawFp,
                                          drawMessage=message)
                count += int(componentBase == referenceBase)
            return count

        def sortedConsistentComponent(component, reference, fp):
            """
            Sort the consistent components in the given C{ComponentByOffsets}
            instance according to how well they match the passed reference.
            The sort order is by increasing match score, so the best
            consistent component is last.

            @param component: A C{ComponentByOffsets} instance.
            @param reference: A C{Read} instance.
            @param fp: A file pointer to write information to.
            @return: The C{int} index of the best consistent component.
            """
            result = []
            for index, cc in enumerate(component.consistentComponents):
                matchCount = ccMatchCount(
                    cc, reference, fp,
                    '    Consistent component %d base draw' % (index + 1))
                score = matchCount / len(cc.nucleotides)
                print('  Consistent component %d (%d reads) has %d exact '
                      'matches with the reference, out of the %d offsets it '
                      'covers (%.2f%%).' %
                      (index + 1, len(cc.reads), matchCount, len(
                          cc.nucleotides), score * 100.0),
                      file=fp)
                result.append((score, len(cc.nucleotides), index, cc))

            result.sort()
            return result

        reference = self.referenceGenomes[referenceId]
        fields = reference.id.split(maxsplit=1)
        if len(fields) == 1:
            referenceIdRest = ''
        else:
            referenceIdRest = ' ' + fields[1]

        infoFile = join(outputDir, 'reference-consensus.txt')
        self.report('    Saving closest consensus to reference info to',
                    infoFile)

        with open(infoFile, 'w') as infoFp:
            print('Building consensus at significant offsets.', file=infoFp)
            consensus = [None] * genomeLength
            offsetsDone = set()
            wantedReads = set()
            unwantedReads = set()
            for count, component in enumerate(components, start=1):
                print('\nExamining component %d with %d locations: %s' %
                      (count, len(component.offsets),
                       commas(map(lambda offset: offset + 1,
                                  component.offsets))),
                      file=infoFp)
                componentOffsets = set(component.offsets)
                sortedCcs = sortedConsistentComponent(component, reference,
                                                      infoFp)

                while componentOffsets - offsetsDone:
                    # The following pop call will raise an IndexError if
                    # the sorted cc list is empty. But if it's empty we
                    # shouldn't be here, because the set of included
                    # offsets should at that point include everything in
                    # this component. Having the naked pop here ensures we
                    # get an exception if this assumption is incorrect.
                    # It's like having an assert to test that we found all
                    # the component's offsets following the loop.
                    score, _, ccIndex, cc = sortedCcs.pop()

                    print('  Incorporating nucleotides from consistent '
                          'component %d (%d reads, score %.2f, covering %d '
                          'locations (%d still undecided in consensus)) to '
                          'consensus.' %
                          (ccIndex + 1, len(
                              cc.reads), score, len(cc.nucleotides),
                           len(set(cc.nucleotides) - offsetsDone)),
                          file=infoFp)

                    wantedReads |= cc.reads
                    for offset in sorted(cc.nucleotides):
                        if offset in offsetsDone:
                            continue
                        nucleotides = cc.nucleotides[offset]
                        referenceBase = reference.sequence[offset]
                        base = commonest(
                            nucleotides,
                            referenceBase,
                            drawFp=infoFp,
                            drawMessage=('      WARNING: base count draw at '
                                         'location %d ' %
                                         (offset + 1)) + ' %(baseCounts)s.')
                        assert consensus[offset] is None
                        consensus[offset] = base
                        offsetsDone.add(offset)

                        # Do some reporting on the base just added.
                        if base == referenceBase:
                            mismatch = ''
                        else:
                            consensusBase = commonest(
                                baseCountAtOffset[offset],
                                referenceBase,
                                drawFp=infoFp,
                                drawMessage=(
                                    '      WARNING: consensus base count '
                                    'draw at location %d ' % (offset + 1)) +
                                ' %(baseCounts)s.')
                            mismatch = (
                                ' (mismatch: reference has %s, all-read '
                                'consensus has %s)' %
                                (referenceBase, consensusBase))

                        print('    Location %d: %s from nucleotides %s%s' %
                              (offset + 1, base, nucleotides.baseCountsToStr(),
                               mismatch),
                              file=infoFp)

                # Print info about the consistent components that were not
                # needed to cover all the offsets in this component. Reverse
                # the list so we print them in decreasing match score order.
                for score, _, ccIndex, cc in reversed(sortedCcs):
                    unwantedReads |= cc.reads
                    print('  Will NOT incorporate nucleotides from consistent '
                          'component %d (%d reads, score %.2f, covering %d '
                          'locations) to consensus.' %
                          (ccIndex + 1, len(
                              cc.reads), score, len(cc.nucleotides)),
                          file=infoFp)

            # Get the base counts at each offset, from the full set of
            # aligned reads minus the reads in cccs we're not using.
            (wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset,
             _) = gatherData(genomeLength,
                             set(alignedReads) - unwantedReads)

            # Process the insignificant offsets, based on all reads EXCEPT
            # those not used in the connected components.
            offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone)
            print('\nAttempting to add bases from %d non-significant '
                  'consensus locations, using all reads, EXCEPT those '
                  'belonging to unused consistent components:' %
                  len(offsetsToTry),
                  file=infoFp)
            for offset in offsetsToTry:
                assert consensus[offset] is None
                baseCount = wantedReadsBaseCountAtOffset[offset]
                if baseCount:
                    referenceBase = reference.sequence[offset]
                    base = commonest(
                        baseCount,
                        referenceBase,
                        drawFp=infoFp,
                        drawMessage=(
                            '    WARNING: consensus base count draw at '
                            'location %d' % (offset + 1)) + ' %(baseCounts)s.')
                    print('  Location %d: %s from nucleotides %s' %
                          (offset + 1, base, baseCountsToStr(baseCount)),
                          file=infoFp,
                          end='')

                    if base == referenceBase:
                        print(file=infoFp)
                    else:
                        print(' (mismatch: reference has %s)' % referenceBase,
                              file=infoFp)
                    consensus[offset] = base
                    offsetsDone.add(offset)

            # Process remaining insignificant offsets, using ALL reads
            # (i.e., including those in cccs that we wanted to avoid
            # using).  At this point, this is the best we can do with these
            # final offsets (otherwise we will get gaps - which in some
            # cases may actually be preferable, because the reference
            # sequence may not be fully covered by the actual infection
            # sequence).
            offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone)
            print('\nAttempting to add bases from %d non-significant '
                  'consensus locations, using all reads, INCLUDING those '
                  'belonging to unused consistent components:' %
                  len(offsetsToTry),
                  file=infoFp)
            for offset in offsetsToTry:
                assert consensus[offset] is None
                referenceBase = reference.sequence[offset]
                baseCount = baseCountAtOffset[offset]
                if baseCount:
                    base = commonest(
                        baseCount,
                        referenceBase,
                        drawFp=infoFp,
                        drawMessage=(
                            '    WARNING: consensus base count draw at '
                            'location %d' % (offset + 1)) + ' %(baseCounts)s.')
                    print('  Location %d: %s from nucleotides %s' %
                          (offset + 1, base, baseCountsToStr(baseCount)),
                          file=infoFp,
                          end='')
                else:
                    # The reads did not cover this offset.
                    base = '-'
                    print('  Location %d: -' % (offset + 1),
                          file=infoFp,
                          end='')

                if base == referenceBase:
                    print(file=infoFp)
                else:
                    print(' (mismatch: reference has %s)' % referenceBase,
                          file=infoFp)
                consensus[offset] = base
                offsetsDone.add(offset)

            # Sanity check: make sure we processed all offsets.
            assert offsetsDone == set(range(genomeLength))

            consensusId = (
                '%s-consensus%s' %
                (self.shortReferenceId[referenceId], referenceIdRest))

            consensus = Read(consensusId, ''.join(consensus))

            # Print details of the match of the consensus to the reference.
            match = compareDNAReads(reference, consensus)
            print('\nOVERALL match with reference:', file=infoFp)
            print(matchToString(match, reference, consensus, indent='  '),
                  file=infoFp)

            # Print any insertions to the reference.
            wantedReadsWithInsertions = (set(referenceInsertions) &
                                         (set(alignedReads) - unwantedReads))
            if wantedReadsWithInsertions:
                print('\nReference insertions present in %d read%s:' %
                      (len(wantedReadsWithInsertions),
                       s(len(wantedReadsWithInsertions))),
                      file=infoFp)
                nucleotides = defaultdict(Counter)
                for readId in wantedReadsWithInsertions:
                    for (offset, sequence) in referenceInsertions[readId]:
                        for index, base in enumerate(sequence):
                            nucleotides[offset + index][base] += 1
                print(nucleotidesToStr(nucleotides, prefix='  '), file=infoFp)
            else:
                print('\nReference insertions: none.', file=infoFp)

        filename = join(outputDir, 'reference-consensus.fasta')
        self.report('    Saving consensus to', filename)
        Reads([consensus]).save(filename)

        wantedCcReadCount = 0
        filename = join(outputDir, 'cc-wanted.fastq')
        with open(filename, 'w') as fp:
            for wantedCcRead in wantedReads:
                alignment = wantedCcRead.alignment
                if not (alignment.is_secondary or alignment.is_supplementary):
                    wantedCcReadCount += 1
                    print(Read(alignment.query_name, alignment.query_sequence,
                               alignmentQuality(alignment)).toString('fastq'),
                          end='',
                          file=fp)
        self.report(
            '    Saved %d read%s wanted in consistent connected components '
            'to %s' % (wantedCcReadCount, s(wantedCcReadCount), filename))

        unwantedReads = set(alignedReads) - wantedReads

        return (consensus, unwantedReads, wantedCcReadCount,
                wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset)
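The commonest helper is central to this method. A minimal sketch consistent with the calls and the drawMessage docstring above (hypothetical; the real function is imported from elsewhere): return the most frequent base, break draws in favor of the tiebreaker when possible, and report draws to drawFp with '%(baseCounts)s' substituted:

import sys
from collections import Counter

def commonest(counts, tiebreaker, drawFp=None,
              drawMessage='Draw %(baseCounts)s'):
    # counts maps each base to its count (e.g., a collections.Counter).
    maxCount = max(counts.values())
    best = {base for base, count in counts.items() if count == maxCount}
    if len(best) > 1:
        if drawFp:
            baseCounts = ' '.join(
                '%s:%d' % (base, counts[base]) for base in sorted(counts))
            print(drawMessage % {'baseCounts': baseCounts}, file=drawFp)
        if tiebreaker in best:
            return tiebreaker
    return best.pop()

print(commonest(Counter('AACC'), 'C', drawFp=sys.stderr))  # 'C'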
Example 14
    def mergeDescription(self, a, b, distance):
        """
        Make a textual description of a cluster merge.

        @param a: An C{int} cluster number.
        @param b: An C{int} cluster number.
        @param distance: The C{float} [0.0, 1.0] distance between the clusters.
        @return: A C{str} side-by-side description of clusters C{a} and C{b}.
        """
        cluster1 = self.readClusters[a]
        cluster2 = self.readClusters[b]

        result1 = []
        result2 = []
        matches = []
        sharedCount = matchCount = 0

        allOffsets = sorted(
            set(cluster1.nucleotides) | set(cluster2.nucleotides))

        for offset in allOffsets:

            inCount = 0

            if offset in cluster1.nucleotides:
                result1.append(cluster1.nucleotides[offset].baseCountsToStr())
                inCount += 1
            else:
                result1.append('-')

            if offset in cluster2.nucleotides:
                result2.append(cluster2.nucleotides[offset].baseCountsToStr())
                inCount += 1
            else:
                result2.append('-')

            if inCount == 2:
                sharedCount += 1
                if (cluster1.nucleotides[offset].commonest
                        & cluster2.nucleotides[offset].commonest):
                    matches.append('*')
                    matchCount += 1
                else:
                    multiple = OffsetBases.highestFrequenciesMultiple(
                        cluster1.nucleotides[offset],
                        cluster2.nucleotides[offset])
                    # Sanity: the multiple cannot be None because that
                    # would mean only one nucleotide is present, and that
                    # case is dealt with by the first part of this if/then.
                    assert multiple is not None
                    if multiple >= ReadCluster.MIN_COMMONEST_MULTIPLE:
                        matchCount += 1
                        matches.append('+')
                    else:
                        matches.append('')
            else:
                matches.append('')

        result1Width = max(len(line) for line in result1)
        result2Width = max(len(line) for line in result2)

        return '\n'.join([
            ('Merging clusters %d and %d with distance %.2f' %
             (a, b, distance)),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (a, len(cluster1.reads), s(len(cluster1.reads)),
              len(cluster1.nucleotides), s(len(cluster1.nucleotides)))),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (b, len(cluster2.reads), s(len(cluster2.reads)),
              len(cluster2.nucleotides), s(len(cluster2.nucleotides)))),
            ('%d matches out of %d shared offsets' %
             (matchCount, sharedCount)),
        ] + [
            '  %d: %*s    %*s    %s' %
            (offset + 1, result1Width, line1, result2Width, line2, match)
            for (offset, line1, line2,
                 match) in zip(allOffsets, result1, result2, matches)
        ])
Example 15
    def analyzeReferenceId(self, referenceId, alignmentFile, outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{dict} containing
            the significant offsets and the consensus sequence that best matches
            C{referenceId}.
        """
        analysis = self.initialReferenceIdAnalysis(referenceId, alignmentFile,
                                                   outputDir)

        if analysis:
            (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
             readsAtOffset, significantOffsets, samFilter,
             paddedSAM) = analysis
        else:
            return

        insignificantOffsets = set(
            range(genomeLength)) - set(significantOffsets)

        reference = self.referenceGenomes[referenceId]
        referenceSequence = reference.sequence

        consensus = []
        for base in referenceSequence:
            ob = OffsetBases()
            ob.incorporateBase(base)
            consensus.append(ob)

        readQueue = PriorityQueue()
        self.updatePriorityQueue(readQueue, alignedReads, consensus,
                                 significantOffsets)

        consensusFilename = join(outputDir, 'reference-consensus.sam')
        nonConsensusFilename = join(outputDir, 'reference-non-consensus.sam')
        self.report('    Writing consensus SAM to', consensusFilename)
        self.report('    Writing non-consensus SAM to', nonConsensusFilename)

        with samfile(alignmentFile) as sam:
            consensusAlignment = AlignmentFile(consensusFilename,
                                               mode='w',
                                               template=sam)
            nonConsensusAlignment = AlignmentFile(nonConsensusFilename,
                                                  mode='w',
                                                  template=sam)

        # Reads with no significant offsets get written to both output files.
        readsWithNoSignificantOffsetsCount = 0
        for read in alignedReads:
            if not read.significantOffsets:
                readsWithNoSignificantOffsetsCount += 1
                consensusAlignment.write(read.alignment)
                nonConsensusAlignment.write(read.alignment)

                for offset in insignificantOffsets:
                    base = read.base(offset)
                    if base is not None:
                        consensus[offset].incorporateBase(base)

        self.report('    %d read%s did not overlap any significant offsets' %
                    (readsWithNoSignificantOffsetsCount,
                     s(readsWithNoSignificantOffsetsCount)))

        readsMatchingConsensusCount = readsNotMatchingConsensusCount = 0
        cutoff = self.cutoff
        while readQueue:
            mismatchFraction, _ = readQueue.lowestPriority()
            read = readQueue.pop()
            if mismatchFraction <= cutoff:
                # We want this read. Incorporate it into the consensus.
                readsMatchingConsensusCount += 1
                consensusAlignment.write(read.alignment)
                affectedReads = set()
                for offset in read.significantOffsets:
                    readBase = read.base(offset)
                    consensus[offset].incorporateBase(readBase)
                    for readAtOffset in readsAtOffset[offset]:
                        if readAtOffset in readQueue:
                            affectedReads.add(readAtOffset)
                self.updatePriorityQueue(readQueue, affectedReads, consensus,
                                         significantOffsets)
            else:
                readsNotMatchingConsensusCount += 1
                nonConsensusAlignment.write(read.alignment)

        consensusAlignment.close()
        nonConsensusAlignment.close()

        self.report(
            '    %d read%s matched the consensus, %d did not.' %
            (readsMatchingConsensusCount, s(readsMatchingConsensusCount),
             readsNotMatchingConsensusCount))

        # Remove the reference bases from the consensus.
        for offset, base in enumerate(referenceSequence):
            consensus[offset].unincorporateBase(base)

        consensusInfoFilename = join(outputDir, 'reference-consensus.txt')
        self.report('    Writing consensus info to', consensusInfoFilename)

        with open(consensusInfoFilename, 'w') as fp:
            consensusSequence = []
            for offset in range(genomeLength):
                # Take a copy of the commonest set because we may pop from
                # it below.
                commonest = set(consensus[offset].commonest)
                referenceBase = referenceSequence[offset]

                if len(commonest) > 1:
                    nucleotides = ' Nucleotides: %s' % (
                        consensus[offset].baseCountsToStr())
                else:
                    nucleotides = ''

                if referenceBase in commonest:
                    consensusBase = referenceBase
                else:
                    if len(commonest) == 1:
                        # Nothing in the included reads covers this offset.
                        consensusBase = '-'
                    elif len(commonest) > 1:
                        # A draw (in which the reference base is not included
                        # and so cannot be used to break the draw). Pop an
                        # arbitrary most-common base.
                        consensusBase = commonest.pop()
                    else:
                        consensusBase = commonest.pop()

                consensusSequence.append(consensusBase)

                mismatch = '' if referenceBase == consensusBase else (
                    ' Mismatch (reference has %s)' % referenceBase)

                print('%d: %s%s%s' %
                      (offset + 1, consensusBase, mismatch, nucleotides),
                      file=fp)

        consensusRead = Read('greedy-consensus-%s' % referenceId,
                             ''.join(consensusSequence))
        consensusFilename = join(outputDir, 'reference-consensus.fasta')
        self.report('    Writing greedy consensus to', consensusFilename)
        Reads([consensusRead]).save(consensusFilename)

        return {
            'consensusRead': consensusRead,
            'significantOffsets': significantOffsets,
        }
Example 16
    def mergeDescriptionWithOffsetScores(self, a, b, distance):
        """
        Make a textual description of a cluster merge, including per-offset
        score information.

        @param a: An C{int} cluster number.
        @param b: An C{int} cluster number.
        @param distance: The C{float} [0.0, 1.0] distance between the clusters.
        @return: A C{str} side-by-side description of clusters C{a} and C{b}.
        """
        cluster1 = self.readClusters[a]
        cluster2 = self.readClusters[b]

        result1 = []
        result2 = []
        offsetScores = []
        matches = []
        sharedCount = matchCount = 0

        allOffsets = sorted(
            set(cluster1.nucleotides) | set(cluster2.nucleotides))

        for offset in allOffsets:

            inCount = 0

            if offset in cluster1.nucleotides:
                result1.append(cluster1.nucleotides[offset].baseCountsToStr())
                inCount += 1
            else:
                result1.append('-')

            if offset in cluster2.nucleotides:
                result2.append(cluster2.nucleotides[offset].baseCountsToStr())
                inCount += 1
            else:
                result2.append('-')

            if inCount == 2:
                sharedCount += 1
                if (cluster1.nucleotides[offset].commonest
                        & cluster2.nucleotides[offset].commonest):
                    matches.append('*')
                    matchCount += 1
                else:
                    matches.append('')

                offsetScores.append('%.3f' % min(
                    OffsetBases.multiplicativeDistance(
                        cluster1.nucleotides[offset],
                        cluster2.nucleotides[offset]),
                    OffsetBases.homogeneousDistance(
                        cluster1.nucleotides[offset],
                        cluster2.nucleotides[offset])))
            else:
                matches.append('')
                offsetScores.append('')

        result1Width = max(len(line) for line in result1)
        result2Width = max(len(line) for line in result2)
        offsetScoresWidth = max(len(line) for line in offsetScores)

        return '\n'.join([
            ('Merging clusters %d and %d with distance %.2f' %
             (a, b, distance)),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (a, len(cluster1.reads), s(len(cluster1.reads)),
              len(cluster1.nucleotides), s(len(cluster1.nucleotides)))),
            ('Cluster %d has %d read%s, covering %d offset%s' %
             (b, len(cluster2.reads), s(len(cluster2.reads)),
              len(cluster2.nucleotides), s(len(cluster2.nucleotides)))),
            ('%d matches out of %d shared offsets' %
             (matchCount, sharedCount)),
        ] + [
            '  %d: %*s    %*s    %*s    %s' %
            (offset + 1, result1Width, line1, result2Width, line2,
             offsetScoresWidth, offsetScore, match)
            for (offset, line1, line2, offsetScore, match
                 ) in zip(allOffsets, result1, result2, offsetScores, matches)
        ])
Example 17
    parser.add_argument(
        '--alignReads',
        action='store_true',
        default=False,
        help=('If specified, print the reads aligned (with "-" characters) '
              'to the genome.'))

    addFASTACommandLineOptions(parser)
    args = parser.parse_args()
    reads = list(parseFASTACommandLineOptions(args))
    # There should only be one "read", the sequence we are to create other
    # reads from.
    assert len(reads) == 1, (
        'FASTA input contained %d sequence%s (expected just one).' %
        (len(reads), s(len(reads))))
    genome = reads[0]
    genomeLen = len(genome)
    meanLength = args.meanLength

    if meanLength > genomeLen:
        raise ValueError('The mean read length (%d) is greater than the '
                         'genome length (%d)' % (int(meanLength), genomeLen))

    if meanLength <= 0:
        raise ValueError('The mean read length must be greater than zero')

    sdLength = args.sdLength

    if sdLength <= 0.0:
        raise ValueError('The read length standard deviation must be > 0.0')
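After these checks, the mean and standard deviation presumably drive normally distributed read lengths. A hypothetical continuation under that assumption (the actual read-generation code is not part of this excerpt):

import random

# Draw one read length from N(meanLength, sdLength), clamped to the
# genome, and take a read from a random offset.
length = max(1, min(genomeLen, int(random.gauss(meanLength, sdLength))))
offset = random.randint(0, genomeLen - length)
sequence = genome.sequence[offset:offset + length]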