Exemple #1
0
    def testReferencesToStr(self):
        """
        The referencesToStr method must return the expected string.

        Two @SQ header lines are given and the result is expected to hold one
        'name (length N)' line per reference, in header order.
        """
        data = '\n'.join([
            '@SQ SN:id1 LN:90',
            '@SQ SN:id2 LN:91',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion cannot
            # leave the SAM file open.
            try:
                self.assertEqual('id1 (length 90)\nid2 (length 91)',
                                 ps.referencesToStr())
            finally:
                ps.close()
Exemple #2
0
    def testAllMMatch(self):
        """
        A simple all-'M' match must result in the expected padded sequence.

        The six-base query is aligned at position 2 of a ten-base reference,
        so one gap is expected on the left and three on the right.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', '-TCTAGG---'), read)
            finally:
                ps.close()
Exemple #3
0
    def testQuerySoftClipReachesRightEdge(self):
        """
        A match with a soft-clipped region that reaches to the right edge of
        the reference must result in the expected padded sequence.

        Two bases match at position 5 and the four soft-clipped bases that
        follow are expected to fill the reference exactly up to its last
        position.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 5 60 2M4S * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', '----TCTAGG'), read)
            finally:
                ps.close()
Exemple #4
0
    def testQuerySoftClipLeft(self):
        """
        A match with a soft-clipped region that does not extend to the left of
        the reference must result in the expected padded sequence.

        The match starts at position 4 and the two soft-clipped bases before
        it are expected to be placed at positions 2 and 3, still inside the
        reference.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 4 60 2S4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', '-TCTAGG---'), read)
            finally:
                ps.close()
Exemple #5
0
    def testRcSuffix(self):
        """
        A reverse-complemented sequence should have the rcSuffix string added
        to its id when an rcSuffix value is passed.

        The record carries flag 16 and only the id is expected to change; the
        padded sequence is expected to be the one stored in the file.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries(rcSuffix='-rc'))
                self.assertEqual(Read('query1-rc', '-TCTAGG---'), read)
            finally:
                ps.close()
Exemple #6
0
    def testRcNeeded(self):
        """
        A reverse-complemented match (flag = 16) when rcNeeded=True is passed
        must result in the expected (reverse complemented) padded sequence.

        The stored query TCTAGG is expected to come back as CCTAGA, its
        reverse complement, padded to the reference length.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries(rcNeeded=True))
                self.assertEqual(Read('query1', '-CCTAGA---'), read)
            finally:
                ps.close()
Exemple #7
0
    def testQuerySoftClipProtrudesBothSides(self):
        """
        A match with a soft-clipped region that extends to both the left and
        right of the reference must result in the expected padded sequence.

        The fifteen-base query overhangs the ten-base reference on both
        sides, so the expected result is the ten-base slice of the query
        that lies over the reference, with no gap characters.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 4 60 5S5M5S * 0 0 TCTAGGCTGACTAAG ZZZZZZZZZZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', 'TAGGCTGACT'), read)
            finally:
                ps.close()
Exemple #8
0
    def testReferenceSkipAlternateChar(self):
        """
        A skip of reference bases must result in the expected padded
        sequence (with gaps) when a queryInsertionChar is passed.

        The two skipped reference positions (the 2N in the CIGAR string) are
        expected to be filled with the 'X' character passed as
        queryInsertionChar.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2N4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries(queryInsertionChar='X'))
                self.assertEqual(Read('query1', '-TCXXTAGG-'), read)
            finally:
                ps.close()
Exemple #9
0
    def testDropSecondary(self):
        """
        Dropping matches flagged as secondary must give the expected result.

        query2 carries flag 256, so with dropSecondary=True only query1 is
        expected to be returned.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 256 ref1 2 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries(dropSecondary=True))
                self.assertEqual(Read('query1', '-TCTAGG---'), read)
            finally:
                ps.close()
Exemple #10
0
    def testMixedMatchSpecificReferenceButNoMatches(self):
        """
        A request for reads aligned against a reference that exists but that
        has no matches must result in an empty list.

        The only alignment in the file is against ref1, while the request is
        for ref2.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            '@SQ SN:ref2 LN:15',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion cannot
            # leave the SAM file open.
            try:
                self.assertEqual([], list(ps.queries(referenceName='ref2')))
            finally:
                ps.close()
Exemple #11
0
    def testMixedMatch(self):
        """
        A match that is a mix of M, =, and X must result in the expected
        padded sequence.

        The expected result is the same as for a plain 6M alignment at the
        same position.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', '-TCTAGG---'), read)
            finally:
                ps.close()
Exemple #12
0
    def testReferenceDeletion(self):
        """
        A deletion of reference bases must result in the expected padded
        sequence (with gaps).

        The two deleted reference positions (the 2D in the CIGAR string) are
        expected to appear as 'N' in the padded query when no alternate
        character is requested.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2D4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', '-TCNNTAGG-'), read)
            finally:
                ps.close()
Exemple #13
0
    def testMinLengthWithReferenceDeletion(self):
        """
        The minLength specification must be applied after deletion of
        reference bases (which results in the query being lengthened to
        continue the match).

        The stored query has six bases, fewer than the minLength of 7, but it
        is still expected to be returned because filling the two deleted
        positions makes it eight characters long.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2D4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries(minLength=7))
                self.assertEqual(Read('query1', '-TCNNTAGG-'), read)
            finally:
                ps.close()
Exemple #14
0
    def testMinLength(self):
        """
        A request for reads of at least a certain length should result
        in the expected result.

        With minLength=6 the six-base query1 is expected to be kept and the
        two-base query2 is expected to be dropped.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries(minLength=6))
                self.assertEqual(Read('query1', '-TCTAGG---'), read)
            finally:
                ps.close()
Exemple #15
0
    def testKeepQualityControlFailures(self):
        """
        Keeping matches flagged as quality control failures must give the
        expected result.

        query2 carries flag 512 and, with keepQCFailures=True, is expected to
        be returned alongside query1.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 512 ref1 4 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read1, read2) = list(ps.queries(keepQCFailures=True))
                self.assertEqual(Read('query1', '-TCTAGG---'), read1)
                self.assertEqual(Read('query2', '---TC-----'), read2)
            finally:
                ps.close()
Exemple #16
0
    def testAllowDuplicateIds(self):
        """
        It must be possible to allow duplicate ids (in this case two
        alignments that share the id 'query1').

        With allowDuplicateIds=True both reads are expected to keep the
        unmodified id.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query1 0 ref1 3 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read1, read2) = list(ps.queries(allowDuplicateIds=True))
                self.assertEqual(Read('query1', '-TCTAGG---'), read1)
                self.assertEqual(Read('query1', '--TC------'), read2)
            finally:
                ps.close()
Exemple #17
0
    def testUnknownReferences(self):
        """
        Passing an unknown reference name to 'queries' must result in an
        UnknownReference exception.

        The exception is expected when the value returned by 'queries' is
        consumed (here by passing it to list), not when 'queries' is called.
        """
        data = '\n'.join([
            '@SQ SN:id1 LN:90',
            '@SQ SN:id2 LN:91',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion cannot
            # leave the SAM file open.
            try:
                error = ("^Reference 'unknown' is not present in the "
                         "SAM/BAM file\\.$")
                queries = ps.queries(referenceName='unknown')
                assertRaisesRegex(self, UnknownReference, error, list,
                                  queries)
            finally:
                ps.close()
Exemple #18
0
    def testDuplicateIdDisambiguation(self):
        """
        Duplicate ids must be disambiguated if allowDuplicateIds is not given.

        Three alignments share the id 'query1'. The first is expected to keep
        its id and the later ones to receive '/1' and '/2' suffixes.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query1 0 ref1 3 60 2= * 0 0 TC ZZ',
            'query1 0 ref1 5 60 2S2= * 0 0 TCGA ZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read1, read2, read3) = list(ps.queries())
                self.assertEqual(Read('query1', '-TCTAGG---'), read1)
                self.assertEqual(Read('query1/1', '--TC------'), read2)
                self.assertEqual(Read('query1/2', '--TCGA----'), read3)
            finally:
                ps.close()
Exemple #19
0
    def testKF414679SoftClipLeft(self):
        """
        Test for a case that wasn't working.

        An 81-base query with 18 soft-clipped bases on the left is aligned at
        position 5, so the soft-clipped region protrudes past the left edge
        of the reference. The expected read is the query with its first 14
        bases removed.
        """
        seq = ('GCCATGCAGTGGAACTCCACAGCATTCCACCAAGCTCTGC'
               'AGAATCCCAAAGTCAGGGGTTTGTATCTTCTTGCTGGTGGC')
        quality = ('ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ'
                   'ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ')
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 5 60 18S63M * 0 0 %s %s' % (seq, quality),
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', seq[14:]), read)
            finally:
                ps.close()
Exemple #20
0
    def testUnequalReferenceLengths(self):
        """
        Passing no reference name to 'queries' when the references have
        different lengths must result in an UnequalReferenceLengthError
        exception.

        The exception is expected when the value returned by 'queries' is
        consumed (here by passing it to list).
        """
        data = '\n'.join([
            '@SQ SN:id1 LN:90',
            '@SQ SN:id2 LN:91',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion cannot
            # leave the SAM file open.
            try:
                # The second fragment is a raw string: '\(' and '\.' are not
                # valid escape sequences in an ordinary string literal and
                # trigger a warning on recent Python versions. The pattern
                # itself is unchanged.
                error = ('^Your SAM/BAM file has 2 reference sequences, and '
                         'their '
                         r'lengths \(90, 91\) are not all identical\.$')
                queries = ps.queries()
                assertRaisesRegex(self, UnequalReferenceLengthError, error,
                                  list, queries)
            finally:
                ps.close()
Exemple #21
0
    def testReferenceInsertion(self):
        """
        An insertion into the reference must result in the expected padded
        sequence and the expected value in the referenceInsertions dictionary.

        The two inserted query bases (the 2I in the CIGAR string) are expected
        to be left out of the padded sequence and recorded, with a count of
        one each, in referenceInsertions.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2I2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            # Close in a finally block so that a failing assertion (or a
            # failed unpacking) cannot leave the SAM file open.
            try:
                (read,) = list(ps.queries())
                self.assertEqual(Read('query1', '-TCGG-----'), read)
                self.assertEqual(
                    {
                        3: {'T': 1},
                        4: {'A': 1},
                    },
                    ps.referenceInsertions)
            finally:
                ps.close()
                    rcSuffix=args.rcSuffix,
                    dropSecondary=args.dropSecondary,
                    dropSupplementary=args.dropSupplementary,
                    dropDuplicates=args.dropDuplicates,
                    allowDuplicateIds=args.allowDuplicateIds,
                    keepQCFailures=args.keepQCFailures,
                    rcNeeded=args.rcNeeded):
                print(read.toString('fasta'), end='')
        except UnequalReferenceLengthError as e:
            raise ValueError(
                str(e) + ' So it is not clear how long the padded output '
                'FASTA sequences should be. Use --referenceName to specify '
                'which reference sequence is the one whose aligned reads you '
                'want printed. Use --listReferenceNames to see a list of '
                'reference sequence names and lengths.')
        except UnknownReference as e:
            raise ValueError(
                str(e) + ' Use --listReferenceNames to see a list of '
                'reference sequence names.')

        if args.listReferenceInsertions:
            if paddedSAM.referenceInsertions:
                print('(0-based) insertions into the reference:\n%s' %
                      nucleotidesToStr(paddedSAM.referenceInsertions, '  '),
                      file=sys.stderr)
            else:
                print('No matches required an insertion into the reference.',
                      file=sys.stderr)
finally:
    paddedSAM.close()