Example #1
0
    def find(self, read):
        """
        Locate possible coils in a sequence using GOR IV. A coil is a maximal
        run of 'C' characters in the GOR IV prediction string.

        @param read: An instance of C{dark.reads.AARead}.
        @return: A generator that yields a C{Landmark} instance for each run
            of 'C' characters, in order of increasing offset.
        """
        preds = predictions(read.sequence)
        base = self._dbParams.featureLengthBase
        # The offset at which the current run of C's began, or None when we
        # are not inside a run.
        coilStart = None
        for offset, prediction in enumerate(preds):
            if prediction == 'C':
                if coilStart is None:
                    # A new run of C's begins here.
                    coilStart = offset
            elif coilStart is not None:
                # The run of C's ended on the previous character.
                coilLength = offset - coilStart
                yield Landmark(self.NAME, self.SYMBOL, coilStart, coilLength,
                               scaleLog(coilLength, base))
                coilStart = None

        if coilStart is not None:
            # The prediction string ended while we were still in a coil.
            coilLength = offset + 1 - coilStart
            yield Landmark(self.NAME, self.SYMBOL, coilStart, coilLength,
                           scaleLog(coilLength, base))
Example #2
0
 def testApoamicyaninFiveCoilsWithNonDefaultFeatureLengthBase(self):
     """
     The GOR4Coil landmark finder must find the five expected landmarks
     in a fragment of the APOAMICYANIN sequence from the GOR IV reference
     database. The symbol detail of each landmark must be its length scaled
     using the non-default featureLengthBase.
     """
     seq = 'DKATIPSESPFAAAEVADGAIVVDIAKMKYETPELHVKVGDTVTWINREA'
     read = AARead('id', seq)
     featureLengthBase = 1.5
     dbParams = DatabaseParameters(featureLengthBase=featureLengthBase)
     landmark = GOR4Coil(dbParams)
     result = list(landmark.find(read))
     # Each expected symbol detail is computed from the length of the
     # landmark it belongs to, so the test does not depend on two different
     # lengths happening to scale to the same value.
     scaled2 = scaleLog(2, featureLengthBase)
     scaled3 = scaleLog(3, featureLengthBase)
     scaled5 = scaleLog(5, featureLengthBase)
     scaled6 = scaleLog(6, featureLengthBase)
     scaled10 = scaleLog(10, featureLengthBase)
     # The original note here gives the GOR IV secondary structure
     # prediction as
     # 'CCCCCCCCCCHHHHHHHCCHHHHHHHHHHHCCCCEEEEECCEEEEEEEEC'.
     # NOTE(review): that string ends in a single 'C', which does not match
     # the final coil of length 5 expected below - confirm whether the
     # quoted prediction is out of date.
     self.assertEqual([
         Landmark('GOR4Coil', 'GC', 0, 10, scaled10),
         Landmark('GOR4Coil', 'GC', 17, 2, scaled2),
         Landmark('GOR4Coil', 'GC', 28, 6, scaled6),
         Landmark('GOR4Coil', 'GC', 39, 3, scaled3),
         Landmark('GOR4Coil', 'GC', 45, 5, scaled5)
     ], result)
Example #3
0
 def testCoilInMiddleAndAtEnd(self):
     """
     The GOR4Coil landmark finder must find all coils in a sequence,
     including one that extends to the end of the sequence.
     """
     read = AARead('id', 'VICVIC')
     landmark = GOR4Coil()
     result = list(landmark.find(read))
     # Each expected symbol detail is computed from the length of the
     # landmark it belongs to (1 and 2), not from an unrelated length.
     scaled1 = scaleLog(1, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     scaled2 = scaleLog(2, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     # The original note here gives the GOR IV secondary structure
     # prediction as 'EECEEC'.
     # NOTE(review): that string starts with 'E', which does not match the
     # coil at offset 0 expected below - confirm whether the quoted
     # prediction is out of date.
     self.assertEqual([
         Landmark('GOR4Coil', 'GC', 0, 2, scaled2),
         Landmark('GOR4Coil', 'GC', 5, 1, scaled1)
     ], result)
Example #4
0
    def testTwoReadsTwoLandmarksSameOffsets(self):
        """
        Adding two identical reads, each with two landmarks at the same
        offsets, must produce a single key in the backend, with both reads
        listed in the dictionary values for that key.

        The key A3:A2:-23 must not be present: it would be redundant, since
        it is the same two landmarks with the same separation and only the
        sign changed.
        """
        sequence = 'FRRRFRRRFAAAAAAAAAAAAAAFRRRFRRRFRRRF'
        params = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[])
        backend = Backend()
        backend.configure(params)
        for readId, subjectIndex in (('id1', '0'), ('id2', '1')):
            backend.addSubject(AARead(readId, sequence), subjectIndex)
        key = 'A2:A3:' + str(scaleLog(23, _DEFAULT_DISTANCE_BASE))
        # Both subjects must appear under the one key, with identical
        # details.
        expected = {
            key: {
                subjectIndex: [[0, 9, 23, 13]]
                for subjectIndex in ('0', '1')
            },
        }
        self.assertEqual(expected, backend.d)
Example #5
0
    def check(self, sequence, expected, dbParams=None):
        """
        Assert that a C{THAlphaHelix} finder, run on a read whose sequence is
        C{sequence}, finds exactly the helices described by C{expected}.

        @param sequence: A C{str} of amino acids.
        @param expected: A C{str} of the same length as C{sequence} in which
            each span of 'H' characters marks where a helix is expected to be
            found. Any other character (e.g., '-') marks a position where no
            helix is expected.
        @param dbParams: A C{DatabaseParameters} instance or C{None} if default
            parameters should be used. This can be used to pass a non-default
            featureLengthBase.
        """
        self.assertEqual(len(sequence), len(expected))
        read = AARead('id', sequence)
        finder = THAlphaHelix(dbParams)
        base = finder._dbParams.featureLengthBase
        # Turn each span of 'H' characters into the landmark we expect the
        # finder to produce for it.
        wanted = [
            Landmark('THAlphaHelix', 'THA', start, end - start,
                     scaleLog(end - start, base))
            for symbol, start, end in stringSpans(expected)
            if symbol == 'H'
        ]
        self.assertEqual(wanted, list(finder.find(read)))
Example #6
0
 def testOneReadOneLandmarkTwoPeaks(self):
     """
     Adding one subject that has one landmark and two peaks must add two
     pairs to the backend.
     """
     params = DatabaseParameters(landmarks=[AlphaHelix],
                                 trigPoints=[Peaks])
     backend = Backend()
     backend.configure(params)
     backend.addSubject(AARead('id', 'FRRRFRRRFASAASA'), '0')
     # One entry is expected for each landmark-to-peak distance.
     expected = {}
     for distance in (13, 10):
         key = 'A2:P:' + str(scaleLog(distance, _DEFAULT_DISTANCE_BASE))
         expected[key] = {'0': [[0, 9, distance, 1]]}
     self.assertEqual(expected, backend.d)
 def testApoamicyaninTwoBetaStrands(self):
     """
     The GOR4BetaStrand landmark finder must find the two expected
     landmarks in a fragment of the APOAMICYANIN sequence from the GOR IV
     reference database, using the default featureLengthBase.
     """
     read = AARead(
         'id', 'DKATIPSESPFAAAEVADGAIVVDIAKMKYETPELHVKVGDTVTWINREA')
     finder = GOR4BetaStrand()
     found = list(finder.find(read))
     base = DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE
     # According to the original author's note, the GOR IV secondary
     # structure prediction is
     # 'CCCCCCCCCCHHHHHHHCCHHHHHHHHHHHCCCCEEEEECCEEEEEEEEC'
     # (NOTE(review): not re-verified against the offsets below).
     expected = [
         Landmark('GOR4BetaStrand', 'GB', offset, length,
                  scaleLog(length, base))
         for offset, length in ((34, 5), (42, 3))
     ]
     self.assertEqual(expected, found)
Example #8
0
 def testApoamicyaninTwoAlphaHelices(self):
     """
     The GOR4AlphaHelix landmark finder must find the two expected
     landmarks in a fragment of the APOAMICYANIN sequence from the GOR IV
     reference database, using the default featureLengthBase.
     """
     read = AARead(
         'id', 'DKATIPSESPFAAAEVADGAIVVDIAKMKYETPELHVKVGDTVTWINREA')
     finder = GOR4AlphaHelix()
     found = list(finder.find(read))
     base = DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE
     # According to the original author's note, the GOR IV secondary
     # structure prediction is
     # 'CCCCCCCCCCHHHHHHHCCHHHHHHHHHHHCCCCEEEEECCEEEEEEEEC'
     # (NOTE(review): not re-verified against the offsets below).
     expected = [
         Landmark('GOR4AlphaHelix', 'GA', offset, length,
                  scaleLog(length, base))
         for offset, length in ((10, 7), (19, 9))
     ]
     self.assertEqual(expected, found)
Example #9
0
 def testLengthMustBeStoredCorrectly(self):
     """
     The length of a GOR4Coil must be stored correctly (not scaled), with
     the scaled length stored as the symbol detail.
     """
     read = AARead('id', 'DKATIPSESP')
     landmark = GOR4Coil()
     result = list(landmark.find(read))
     # The expected symbol detail is computed from the expected landmark
     # length (10), so the test does not rely on lengths 9 and 10 happening
     # to scale to the same value.
     scaled10 = scaleLog(10, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     self.assertEqual([Landmark('GOR4Coil', 'GC', 0, 10, scaled10)], result)
Example #10
0
 def testAllCoil(self):
     """
     The GOR4Coil landmark finder must report a single coil that spans the
     whole sequence.
     """
     read = AARead('id', 'EA')
     finder = GOR4Coil()
     found = list(finder.find(read))
     # According to the original author's note, the GOR IV secondary
     # structure prediction is 'CC'.
     expected = Landmark(
         'GOR4Coil', 'GC', 0, 2,
         scaleLog(2, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE))
     self.assertEqual([expected], found)
Example #11
0
 def testHashWithSymbolDetail(self):
     """
     The database hash function must return the expected value when the
     landmark it is passed has a symbol detail (5 in this test).
     """
     params = DatabaseParameters(landmarks=[], trigPoints=[])
     backend = Backend()
     backend.configure(params)
     # The features are at offsets 20 and 30, so are 10 apart.
     actual = backend.hash(Landmark('name', 'A', 20, 0, 5),
                           TrigPoint('name', 'B', 30))
     expected = 'A5:B:' + str(scaleLog(10, _DEFAULT_DISTANCE_BASE))
     self.assertEqual(expected, actual)
Example #12
0
 def testStoreLengthCorrectly(self):
     """
     The length of a GOR4AlphaHelix must be stored correctly (not scaled)
     when a non-default featureLengthBase is used.
     """
     base = 1.5
     read = AARead(
         'id',
         'DKATIPSESPFAAAEVAAIVFAAAEVAAIVVFAAAEVAAIVVDIAKMKYFAAAEVAAIVVDI')
     finder = GOR4AlphaHelix(DatabaseParameters(featureLengthBase=base))
     found = list(finder.find(read))
     # The unscaled length (46) is the landmark's length; the scaled length
     # is its symbol detail.
     expected = Landmark('GOR4AlphaHelix', 'GA', 10, 46, scaleLog(46, base))
     self.assertEqual([expected], found)
Example #13
0
 def testApoamicyaninTwoAlphaHelixsWithNonDefaultFeatureLengthBase(self):
     """
     The GOR4AlphaHelix landmark finder must find the two expected
     landmarks in a fragment of the APOAMICYANIN sequence from the GOR IV
     reference database. The symbol detail of each landmark must be its
     length scaled using the non-default featureLengthBase.
     """
     base = 1.5
     read = AARead(
         'id', 'DKATIPSESPFAAAEVADGAIVVDIAKMKYETPELHVKVGDTVTWINREA')
     finder = GOR4AlphaHelix(DatabaseParameters(featureLengthBase=base))
     found = list(finder.find(read))
     # According to the original author's note, the GOR IV secondary
     # structure prediction is
     # 'CCCCCCCCCCHHHHHHHCCHHHHHHHHHHHCCCCEEEEECCEEEEEEEEC'
     # (NOTE(review): not re-verified against the offsets below).
     expected = [
         Landmark('GOR4AlphaHelix', 'GA', offset, length,
                  scaleLog(length, base))
         for offset, length in ((10, 7), (19, 9))
     ]
     self.assertEqual(expected, found)
 def testApoamicyaninTwoBetaStrandsWithNonDefaultFeatureLengthBase(self):
     """
     The GOR4BetaStrand landmark finder must find the two expected
     landmarks in a fragment of the APOAMICYANIN sequence from the GOR IV
     reference database. The symbol detail of each landmark must be its
     length scaled using the non-default featureLengthBase.
     """
     base = 1.5
     read = AARead(
         'id', 'DKATIPSESPFAAAEVADGAIVVDIAKMKYETPELHVKVGDTVTWINREA')
     finder = GOR4BetaStrand(DatabaseParameters(featureLengthBase=base))
     found = list(finder.find(read))
     # According to the original author's note, the GOR IV secondary
     # structure prediction is
     # 'CCCCCCCCCCHHHHHHHCCHHHHHHHHHHHCCCCEEEEECCEEEEEEEEC'
     # (NOTE(review): not re-verified against the offsets below).
     expected = [
         Landmark('GOR4BetaStrand', 'GB', offset, length,
                  scaleLog(length, base))
         for offset, length in ((34, 5), (42, 3))
     ]
     self.assertEqual(expected, found)
Example #15
0
 def testHashWithFeatureOnRight(self):
     """
     The database hash function must return the expected (positive offset)
     hash when the second feature is to the right of the first.
     """
     params = DatabaseParameters(landmarks=[], trigPoints=[])
     backend = Backend()
     backend.configure(params)
     # The trig point (offset 30) is 10 to the right of the landmark
     # (offset 20).
     actual = backend.hash(Landmark('name', 'A', 20, 0),
                           TrigPoint('name', 'B', 30))
     expected = 'A:B:' + str(scaleLog(10, _DEFAULT_DISTANCE_BASE))
     self.assertEqual(expected, actual)
 def testBetaStrandAtStartAndInMiddle(self):
     """
     The GOR4BetaStrand landmark finder must find the expected beta strand
     in a short sequence.

     NOTE(review): the test name and the prediction quoted below suggest
     two strands (one at the start and one in the middle), but exactly one
     landmark (offset 2, length 4) is asserted - confirm which is intended.
     """
     read = AARead('id', 'VICVICV')
     finder = GOR4BetaStrand()
     found = list(finder.find(read))
     # According to the original author's note, the GOR IV secondary
     # structure prediction is 'EECEEEC'.
     expected = Landmark(
         'GOR4BetaStrand', 'GB', 2, 4,
         scaleLog(4, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE))
     self.assertEqual([expected], found)
Example #17
0
 def testApoamicyaninFiveCoils(self):
     """
     The GOR4Coil landmark finder must find the five expected landmarks
     in a fragment of the APOAMICYANIN sequence from the GOR IV reference
     database, with each symbol detail being the landmark's length scaled
     using the default featureLengthBase.
     """
     seq = 'DKATIPSESPFAAAEVADGAIVVDIAKMKYETPELHVKVGDTVTWINREA'
     read = AARead('id', seq)
     landmark = GOR4Coil()
     result = list(landmark.find(read))
     # Each expected symbol detail is computed from the length of the
     # landmark it belongs to, so the test does not depend on two different
     # lengths happening to scale to the same value.
     scaled2 = scaleLog(2, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     scaled3 = scaleLog(3, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     scaled5 = scaleLog(5, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     scaled6 = scaleLog(6, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     scaled10 = scaleLog(10, DatabaseParameters.DEFAULT_FEATURE_LENGTH_BASE)
     # The original note here gives the GOR IV secondary structure
     # prediction as
     # 'CCCCCCCCCCHHHHHHHCCHHHHHHHHHHHCCCCEEEEECCEEEEEEEEC'.
     # NOTE(review): that string ends in a single 'C', which does not match
     # the final coil of length 5 expected below - confirm whether the
     # quoted prediction is out of date.
     self.assertEqual([
         Landmark('GOR4Coil', 'GC', 0, 10, scaled10),
         Landmark('GOR4Coil', 'GC', 17, 2, scaled2),
         Landmark('GOR4Coil', 'GC', 28, 6, scaled6),
         Landmark('GOR4Coil', 'GC', 39, 3, scaled3),
         Landmark('GOR4Coil', 'GC', 45, 5, scaled5)
     ], result)
Example #18
0
 def testHashWithFeatureOnRightAndNonDefaultDistanceBase(self):
     """
     The database hash function must return the expected hash when the
     database has a non-default distance base and the second feature is to
     the right of the first.
     """
     params = DatabaseParameters(landmarks=[], trigPoints=[],
                                 distanceBase=1.5)
     backend = Backend()
     backend.configure(params)
     # The trig point (offset 30) is 10 to the right of the landmark
     # (offset 20); the distance must be scaled using base 1.5.
     actual = backend.hash(Landmark('name', 'A', 20, 0),
                           TrigPoint('name', 'B', 30))
     expected = 'A:B:' + str(scaleLog(10, 1.5))
     self.assertEqual(expected, actual)
Example #19
0
 def testNonDefaultFeatureLengthBase(self):
     """
     A finder that is given parameters with a non-default
     featureLengthBase must use that base.
     """
     base = 3.0
     params = DatabaseParameters(featureLengthBase=base)
     read = AARead('id', 'RF' * 20)
     finder = THAlphaHelix(params)
     self.assertEqual(base, finder._dbParams.featureLengthBase)
     # The sequence has length 40, and floor(log base 3 of 40) = 3.
     expectedDetail = scaleLog(40, base)
     self.assertEqual(3, expectedDetail)
     # Only the first feature found is examined.
     firstFeature = list(finder.find(read))[0]
     self.assertEqual(expectedDetail, firstFeature.symbolDetail)
Example #20
0
 def testOneReadTwoLandmarks(self):
     """
     Adding one subject that has two landmarks must add one key to the
     backend.
     """
     params = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[])
     backend = Backend()
     backend.configure(params)
     subject = AARead('id', 'FRRRFRRRFAAAAAAAAAAAAAAFRRRFRRRFRRRF')
     backend.addSubject(subject, '0')
     key = 'A2:A3:' + str(scaleLog(23, _DEFAULT_DISTANCE_BASE))
     expected = {key: {'0': [[0, 9, 23, 13]]}}
     self.assertEqual(expected, backend.d)
Example #21
0
 def testOneReadOneLandmarkTwoPeaksLimitOnePairPerLandmark(self):
     """
     Adding one subject that has one landmark and two peaks, with a limit
     of one pair per landmark imposed, must add only one key to the
     backend.
     """
     params = DatabaseParameters(landmarks=[AlphaHelix],
                                 trigPoints=[Peaks], limitPerLandmark=1)
     backend = Backend()
     backend.configure(params)
     backend.addSubject(AARead('id', 'FRRRFRRRFASAASA'), '0')
     # Only the pair at distance 10 is expected.
     key = 'A2:P:' + str(scaleLog(10, _DEFAULT_DISTANCE_BASE))
     expected = {key: {'0': [[0, 9, 10, 1]]}}
     self.assertEqual(expected, backend.d)
Example #22
0
 def testOneReadOneLandmarkTwoPeaksIntermediateMaxDistance(self):
     """
     Adding one subject that has one landmark and two peaks, with a
     maximum distance imposed that makes one of the peaks too far away,
     must add only one key to the backend.
     """
     params = DatabaseParameters(landmarks=[AlphaHelix],
                                 trigPoints=[Peaks], maxDistance=11)
     backend = Backend()
     backend.configure(params)
     backend.addSubject(AARead('id', 'FRRRFRRRFASAASA'), '0')
     # Only the pair at distance 10 (within the maximum of 11) is expected.
     key = 'A2:P:' + str(scaleLog(10, _DEFAULT_DISTANCE_BASE))
     expected = {key: {'0': [[0, 9, 10, 1]]}}
     self.assertEqual(expected, backend.d)
Example #23
0
 def testOneReadOneLandmarkTwoPeaksIntermediateMinDistance(self):
     """
     Adding one subject that has one landmark and two peaks, with an
     intermediate minimum distance imposed, must add only the key for the
     pair that exceeds the minimum distance.
     """
     params = DatabaseParameters(landmarks=[AlphaHelix],
                                 trigPoints=[Peaks], minDistance=11)
     backend = Backend()
     backend.configure(params)
     backend.addSubject(AARead('id', 'FRRRFRRRFASAASA'), '0')
     # Only the pair at distance 13 (beyond the minimum of 11) is expected.
     key = 'A2:P:' + str(scaleLog(13, _DEFAULT_DISTANCE_BASE))
     expected = {key: {'0': [[0, 9, 13, 1]]}}
     self.assertEqual(expected, backend.d)
Example #24
0
    def _isHelix(self, startOffset, lastGoodOffset, extremaCount):
        """
        Decide whether the current subsequence (which begins at
        C{startOffset} in a read) can be emitted by C{find} as a possible
        helix.

        @param startOffset: The C{int} offset where the possible helix started
            in the read.
        @param lastGoodOffset: The C{int} offset where we encountered the last
            AA (an extremum) suspected of being in the helix.
        @param extremaCount: The C{int} number of extrema found in this
            possible helix.
        @return: A C{Landmark} instance if the subsequence is at least
            C{self.MIN_HELIX_LENGTH} long and has at least
            C{self.MIN_EXTREMA_COUNT} extrema. Else C{None}.
        """
        # Both offsets are inclusive, hence the + 1.
        length = lastGoodOffset - startOffset + 1
        qualifies = (length >= self.MIN_HELIX_LENGTH
                     and extremaCount >= self.MIN_EXTREMA_COUNT)
        if not qualifies:
            return None
        scaledLength = scaleLog(length, self._dbParams.featureLengthBase)
        return Landmark(self.NAME, self.SYMBOL, startOffset, length,
                        scaledLength)
Example #25
0
 def testOneReadOneLandmarkOnePeakDistanceBase(self):
     """
     When a non-default distanceBase of 2.0 is used, the right distance
     must be calculated. Here the offsets are 10 AA apart, and the
     distanceBase scaling will change that to a 3 (since int(log base 2 10)
     = 3). The 3 is deliberately not tested for explicitly, since it may
     change if the scale function is ever changed. That's desirable, but
     there are already tests in test_distance.py that will break in that
     case.
     """
     base = 2.0
     params = DatabaseParameters(landmarks=[AlphaHelix],
                                 trigPoints=[Peaks],
                                 distanceBase=base)
     backend = Backend()
     backend.configure(params)
     backend.addSubject(AARead('id', 'FRRRFRRRFASA'), '0')
     key = 'A2:P:' + str(scaleLog(10, base))
     expected = {key: {'0': [[0, 9, 10, 1]]}}
     self.assertEqual(expected, backend.d)
Example #26
0
 def testMultipleSubjectOffsets(self):
     """
     Adding one subject that has one landmark and one peak separated by 10
     bases and then, later in the subject, the same pair with the same
     separation, must add one key to the backend with two subject offsets.
     Note that minDistance and maxDistance are used to discard longer and
     shorter distance pairs that only have one subject offset (i.e., that
     only appear in the subject once).
     """
     unit = 'FRRRFRRRFASA'
     params = DatabaseParameters(landmarks=[AlphaHelix],
                                 trigPoints=[Peaks], minDistance=5,
                                 maxDistance=10)
     backend = Backend()
     backend.configure(params)
     # The subject is the same sequence repeated twice.
     backend.addSubject(AARead('id', unit * 2), '0')
     key = 'A2:P:' + str(scaleLog(10, _DEFAULT_DISTANCE_BASE))
     expected = {key: {'0': [[0, 9, 10, 1], [12, 9, 22, 1]]}}
     self.assertEqual(expected, backend.d)
Example #27
0
    def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
        """
        Initialize the base class for using cluster analysis to evaluate how
        well various feature finders and database parameter settings can
        separate a set of sequences. The clustering is based on feature
        offset deltas.

        @param sequences: Either A C{str} filename of sequences to consider or
            a C{light.reads.Reads} instance.
        @param labels: A C{dict} with a label for each sequence id in
            C{sequences}. These are the known categories of each sequence.
        @param defaultLabel: If not C{None}, a label to use for reads whose ids
            are not present in C{labels}. If C{None} and a read id has no label
            a ValueError is raised.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @raises ValueError: If the id of a read is not in labels and no default
            label has been set, or if there are no reads in C{sequences}.
        """
        reads = (FastaReads(sequences, readClass=AAReadWithX, upperCase=True)
                 if isinstance(sequences, str) else sequences)
        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)
        trueLabels = []
        allOffsetDeltas = []

        for read in reads:
            label = labels.get(read.id, defaultLabel)
            if label is None:
                raise ValueError('Read %r has no corresponding label' %
                                 read.id)
            trueLabels.append(label)
            # Count how many times each scaled landmark-to-trig-point offset
            # delta occurs among the scanned pairs of this read.
            allOffsetDeltas.append(Counter(
                scaleLog(trigPoint.offset - landmark.offset,
                         database.dbParams.distanceBase)
                for landmark, trigPoint in
                backend.getScannedPairs(backend.scan(read))))

        nReads = len(reads)
        if nReads == 0:
            raise ValueError('No sequences were found in %r' % sequences)

        # Don't check that len(reads) == len(labels). I.e., ignore extra labels
        # to make using this class interactively more convenient.

        # The affinity matrix starts out as all 1.0 values, so the diagonal
        # never needs to be set explicitly.
        affinity = np.ones((nReads, nReads))

        # Only the upper triangle is computed; each value is mirrored into
        # the lower triangle.
        for row, rowDeltas in enumerate(allOffsetDeltas):
            for col in range(row + 1, nReads):
                value = self.affinityFromOffsetDeltas(rowDeltas,
                                                      allOffsetDeltas[col])
                affinity[row, col] = affinity[col, row] = value

        self.nReads = nReads
        self.affinity = affinity
        self.trueLabels = trueLabels