def test_read_pslx(self):
    """Read the pslx fixture and check the number of rows and the column header."""
    expected_columns = [
        'match', 'mismatch', 'repmatch', 'ncount',
        'qgap_count', 'qgap_bases',
        'tgap_count', 'tgap_bases',
        'strand',
        'qname', 'qsize', 'qstart', 'qend',
        'tname', 'tsize', 'tstart', 'tend',
        'block_count', 'block_sizes',
        'qstarts', 'tstarts',
        'qseqs', 'tseqs',
    ]
    # sequences keyed by record id, read from the blat_input.fa fixture
    seq_by_id = {
        entry.id: entry.seq
        for entry in SeqIO.parse(get_data('blat_input.fa'), 'fasta')
    }

    header, rows = Blat.read_pslx(get_data('blat_output.pslx'), seq_by_id)

    self.assertEqual(11067, len(rows))
    self.assertEqual(expected_columns, header)
def test_pslx_row_to_pysam_single_block(self):
    """Convert a one-block '+' strand row with no reference genome (None).

    Checks the reference id of the resulting read and the interval of the
    query that it covers.
    """
    full_query = (
        'AGCCTCCCAAGTAGCTGGGACTACAGGCGCCCGCCACTACGCCCGGCTAATTTTTTGTATTTTTAGTAGAGACGGGGTTTCACCGTTTT'
        'AGCCAGGATGGTCTCGATCTCCTGACCTCATGATCCGCCCGCCTCGGC'
    )
    row = {
        'score': 20,
        'tseqs': ['AATACCAAATACATGATATA'],
        'tstarts': [3432307],
        'tstart': 3432307,
        'block_sizes': [20],
        'qname': 'seq1',
        'tname': 'Y',
        'qstarts': [93],
        'strand': '+',
        'qseqs': ['AATACCATACATGATATA'],
        'percent_ident': 100.0,
        'qseq_full': full_query,
    }

    aligned = Blat.pslx_row_to_pysam(row, self.cache, None)

    self.assertEqual(23, aligned.reference_id)
    self.assertEqual(Interval(93, 112), query_coverage_interval(aligned))
def test_pslx_row_to_pysam_revcomp_deletion(self, cache):
    """Convert a two-block '-' strand row against REFERENCE_GENOME.

    Checks the reference id, query coverage, reference start, cigar and the
    two halves of the resulting query sequence.
    """
    # the same two block sequences are used for both the query and the target
    first_block = 'TAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCA'
    second_block = 'CCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG'
    row = {
        'block_count': 2,
        'tstarts': [2205, 2281],
        'block_sizes': [50, 34],
        'qname': 'seq1',
        'tname': 'reference3',
        'qstarts': [0, 50],
        'strand': '-',
        'qseq_full': 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTA',
        'score': 1,
        'qseqs': [first_block, second_block],
        'tseqs': [first_block, second_block],
    }

    aligned = Blat.pslx_row_to_pysam(row, cache, REFERENCE_GENOME)

    assert aligned.reference_id == 3
    assert query_coverage_interval(aligned) == Interval(0, 83)
    assert aligned.reference_start == 2205
    assert aligned.cigar == [(CIGAR.EQ, 51), (CIGAR.D, 26), (CIGAR.EQ, 33)]
    assert aligned.query_sequence[0:50] == first_block
    assert aligned.query_sequence[50:] == second_block
def test_pslx_row_to_pysam_duplication(self):
    """Convert a two-block '+' strand row whose query blocks are 98 bases apart.

    The read is built against a mocked slice of reference '14'; the cigar is
    compared after passing it through _cigar.convert_for_igv.
    """
    reference_slice = MockLongString(
        'TTCTTCCATGCCCCCTAATCATGGCCACATTGTATCAGCCTGAGCATGAGCAACAGCACCATGGCCACATACGGGAATGGGCCTCATTGGTGTAATATTTGGCAGATTCTCTCCACACCCCCCGTGGCGGTCTGGCTTACTGTTAAGAAGGGTAACCTTAAAAAATACATTTCCCACTCCAGAAAATACTCATATGTGGCCTGTTAGCAGCACAAGAAGGGTGAAAGCAATGCCCATTCCTGCCTCCCTCCCCCTGCTCACCTCCACGTCCCTGTTTGCCCCTTTGTAGGTGAAGTGAGTATATTCAGCGTCTTCATGGCAGGGGAGAGGGTGTATTAATCCGTCTATGTCCGCTGGAAAGGCAGTCTCTGAGCGGGCCACAAGGGTTCAGCCATGGCCCATCCAATAACCTTTTTGATGACTTGGATGAAGAGACAAACATTCCAACCACATTCAAAGATCCAGACCTCCAAAGTGTGGCTCATTTGGTAGATAATGGAATTATATTTGGAAAGCATTTCCCGCAGCTGGGATGATGGGTCAAAAACAGATAGCATTTTACCAGATCATATTTGTGTGTGTGTGTGTGCGCGCGTGTGTGTGTGTGTGTGTGTGTGTTTTAAATTCAGTTTCCCAACTACAGGATG',
        offset=73014463,
    )
    reference = {'14': MockObject(seq=reference_slice)}
    full_query = 'AAGAAGGGTAACCTTAAAAAATACATTTCCCACTCCAGAAAATACTCATATGTGGCCTGTTAGCAGCACAAGAAGGGTGAAAGCAATGCCCATTCCTGCCTCCCTCCCCCTGCTCACCTCCACGTCCCTGTTTGCCCCTTTACTCATATGTGGCCTGTTAGCAGCACAAGAAGGGTGAAAGCAATGCCCATTCCTGCCTCCCTCCCCCTGCTCACCTCCACGTCCCTGTTTGCCCCTTTGTAGGTGAAGTGAGTATATTCAGCGTCTTC'
    row = {
        'block_count': 2,
        'tstarts': [73014606, 73014747],
        'block_sizes': [141, 30],
        'qname': '',
        'tname': '14',
        'qstarts': [0, 239],
        'strand': '+',
        'qseq_full': full_query,
        'score': 1,
    }

    aligned = Blat.pslx_row_to_pysam(row, self.cache, reference)

    self.assertEqual(13, aligned.reference_id)
    self.assertEqual(73014606, aligned.reference_start)
    igv_cigar = _cigar.convert_for_igv(aligned.cigar)
    self.assertEqual([(CIGAR.M, 141), (CIGAR.I, 98), (CIGAR.M, 30)], igv_cigar)
    # the whole query is expected to be covered; length is read back from the row
    whole_query = Interval(0, len(row['qseq_full']) - 1)
    self.assertEqual(whole_query, query_coverage_interval(aligned))
def test_pslx_row_to_pysam_gapped_alignment_with_reference(self):
    """Convert a two-block '+' strand row against REFERENCE_GENOME.

    Checks the reference id, query coverage, reference start and the
    EQ/D/EQ cigar of the resulting read.
    """
    full_query = (
        'ATCTAATAACTTGATCAATA'
        'TCTGTGATTATATTTTCATT'
        'GCCTTCC'
        'AATTTTGCAGATTATAAGAT'
        'CAATAGATATTTATTGTAAA'
        'ATGCACAAATAGTGCAACAT'
        'TTCTTAAAGTAGACCGTGAA'
        'ATACTTCATGTTGCCATGTT'
    )
    # NOTE(review): 'block_count' is 1 although two blocks are listed below;
    # confirm whether the converter reads this field at all.
    row = {
        'block_count': 1,
        'tstarts': [950, 7233],
        'block_sizes': [47, 100],
        'qname': 'seq1',
        'tname': 'fake',
        'qstarts': [0, 47],
        'strand': '+',
        'qseq_full': full_query,
        'score': 1,
    }

    aligned = Blat.pslx_row_to_pysam(row, self.cache, REFERENCE_GENOME)

    self.assertEqual(0, aligned.reference_id)
    self.assertEqual(Interval(0, 146), query_coverage_interval(aligned))
    self.assertEqual(950, aligned.reference_start)
    expected_cigar = [(CIGAR.EQ, 53), (CIGAR.D, 6236), (CIGAR.EQ, 94)]
    self.assertEqual(expected_cigar, aligned.cigar)
def test_pslx_row_to_pysam_gapped_alignment(self, cache):
    """Convert a two-block '+' strand row with no reference genome (None).

    Checks the reference id, query coverage, reference start and the
    M/D/M cigar of the resulting read.
    """
    full_query = (
        'ATCTAATAACTTGATCAATA'
        'TCTGTGATTATATTTTCATT'
        'GCCTTCC'
        'AATTTTGCAGATTATAAGAT'
        'CAATAGATATTTATTGTAAA'
        'ATGCACAAATAGTGCAACAT'
        'TTCTTAAAGTAGACCGTGAA'
        'ATACTTCATGTTGCCATGTT'
    )
    # NOTE(review): 'block_count' is 1 although two blocks are listed below;
    # confirm whether the converter reads this field at all.
    row = {
        'block_count': 1,
        'tstarts': [950, 7233],
        'block_sizes': [47, 100],
        'qname': 'seq1',
        'tname': 'fake',
        'qstarts': [0, 47],
        'strand': '+',
        'qseq_full': full_query,
        'score': 1,
    }

    aligned = Blat.pslx_row_to_pysam(row, cache, None)

    assert aligned.reference_id == 0
    assert query_coverage_interval(aligned) == Interval(0, 146)
    assert aligned.reference_start == 950
    expected_cigar = [(CIGAR.M, 47), (CIGAR.D, 6236), (CIGAR.M, 100)]
    assert aligned.cigar == expected_cigar
def test_overlapping_blat_blocks_error(self):
    """Expect an AssertionError when converting this five-block row.

    The conversion uses a mocked cache and no reference genome (None).
    """
    full_query = (
        'CAAAAGGAAATACCTTCACATAAATTCTAGACGGAAGCAATCTGAGAAACTTTTATTGTGATTTGTGCATTCACTTCACAGAGTTAAAACTTTCTTTTGATT'
        'GAGCAGTTTGAAACTCTGTTTTTGTAGAATCTGCAAGTGGACATTTGGAGCGCTTTGAGGCCTATGGTGGAAAAGGAAATATCTTCACAGGAAAACTAGATA'
        'GAAGTATTCTGAGAAACTTCTTTGTGATGTATGCAGTCATATCTCAGA'
    )
    blat_row = {
        'strand': '+',
        'qname': 'seq23',
        'tname': '7',
        'block_sizes': [54, 53, 36, 80, 29],
        'qstarts': [0, 55, 108, 143, 223],
        'tstarts': [61279112, 61279166, 61397315, 61990208, 62366144],
        'score': 207,
        'percent_ident': 91.3,
        'qseq_full': full_query,
    }
    mock_cache = Mock(reference_id=MockFunction(6))

    with self.assertRaises(AssertionError):
        Blat.pslx_row_to_pysam(blat_row, mock_cache, None)
def test_pslx_row_to_pysam_full_reverse(self):
    """Convert a one-block '-' strand row with no reference genome (None).

    Checks the reference id, the S/M cigar, the reference start and the
    query coverage of the resulting read.
    """
    full_query = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
    row = {
        'match': 128,
        'mismatch': 0,
        'repmatch': 0,
        'ncount': 0,
        'qgap_count': 0,
        'qgap_bases': 0,
        'tgap_count': 0,
        'tgap_bases': 0,
        'strand': '-',
        'qname': 'seq1',
        'tname': 'reference3',
        'tsize': 3711,
        'block_sizes': [128],
        'qstarts': [117],
        'tstarts': [2187],
        'qseqs': [
            'TGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG'
        ],
        'tseqs': [
            'TGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG'
        ],
        '_index': 1,
        'score': 128,
        'percent_ident': 100.0,
        'qseq_full': full_query,
    }

    aligned = Blat.pslx_row_to_pysam(row, self.cache, None)

    self.assertEqual(3, aligned.reference_id)
    expected_cigar = [(CIGAR.S, 117), (CIGAR.M, 128)]
    self.assertEqual(expected_cigar, aligned.cigar)
    self.assertEqual(2187, aligned.reference_start)
    self.assertEqual(Interval(117, 244), query_coverage_interval(aligned))
def test_simple(self):
    """Convert a one-block '-' strand row using a mocked cache and reference.

    Checks the reference id and name, that the read's query sequence is the
    reverse complement of the row's full query sequence, and the S/EQ cigar.
    """
    full_query = (
        'ACATGTGCACAACGTGCAGGTTTGTTACATATGTATACATGTGCCATGTTGGTTTGCTGCACCCATTAACTCGTCCTAGTTTATTACTAGTCTTCAGACATC'
        'CAGAAAATAGAGTAAGATACTAGGTAGACATAACACCTAGATACATCCGTAAGGCATTTGTTTCCTATCACATGGCCCATTCTAGCTTAACACCCACCAACT'
    )
    blat_row = {
        'match': 142,
        'mismatch': 0,
        'repmatch': 0,
        'ncount': 0,
        'qgap_count': 0,
        'qgap_bases': 0,
        'tgap_count': 0,
        'tgap_bases': 0,
        'strand': '-',
        'qname': 'seq1',
        'qsize': 204,
        'qstart': 0,
        'qend': 142,
        'tname': '17',
        'tsize': 81195210,
        'tstart': 32673408,
        'tend': 32673550,
        'block_count': 1,
        'block_sizes': [142],
        'qstarts': [62],
        'tstarts': [32673408],
        '_index': 880,
        'score': 142,
        'percent_ident': 100.0,
        'qseq_full': full_query,
    }
    reference_slice = MockLongString(
        'ACTAGGTGTTATGTCTACCTAGTATCTTACTCTATTTTCTGGATGTCTGAAGACTAGTAATAAACTAGGACGAGTTAATGGGTGCAGCAAACCAACATGGCACATG'
        'TATACATATGTAACAAACCTGCACGTTGTGCACATGTACCCTAAAACTTAAAGTATAAAAAAAAATTTCACTGAGCATAAGACTTCAGACACAAAAGAGTGCATGC'
        'CATATAATTCCATTTATGTGAATTTCAAGAACAATCAGTGATGACAGAAGTCAAAGTAGTGGTCACCTCTGGAAGGTGGGACATTGACC',
        32673407,
    )
    reference = {'17': Mock(seq=reference_slice)}
    mock_cache = Mock(reference_id=MockFunction(16))

    aligned = Blat.pslx_row_to_pysam(blat_row, mock_cache, reference)

    self.assertEqual(16, aligned.reference_id)
    self.assertEqual('17', aligned.reference_name)
    self.assertEqual(blat_row['qseq_full'], reverse_complement(aligned.query_sequence))
    self.assertEqual([(CIGAR.S, 62), (CIGAR.EQ, 142)], aligned.cigar)
def test_pslx_row_to_pysam_simple_with_reference(self):
    """Convert a one-block '+' strand row against REFERENCE_GENOME.

    Checks the reference id, query coverage, reference start/end and the
    single EQ cigar entry of the resulting read.
    """
    full_query = (
        'ATCTAATAACTTGATCAATA'
        'TCTGTGATTATATTTTCATT'
        'GCCTTCCAATTTT'
    )
    row = {
        'tstarts': [950],
        'block_sizes': [53],
        'qname': 'seq1',
        'tname': 'fake',
        'qstarts': [0],
        'strand': '+',
        'score': 0,
        'qseq_full': full_query,
    }

    aligned = Blat.pslx_row_to_pysam(row, self.cache, REFERENCE_GENOME)

    self.assertEqual(0, aligned.reference_id)
    self.assertEqual(Interval(0, 52), query_coverage_interval(aligned))
    self.assertEqual(950, aligned.reference_start)
    self.assertEqual(1003, aligned.reference_end)
    self.assertEqual([(CIGAR.EQ, 53)], aligned.cigar)
def test_pslx_row_to_pysam_simple_with_reference(self, cache):
    """Convert a one-block '+' strand row against REFERENCE_GENOME.

    Checks the reference id, query coverage, reference start/end and the
    single EQ cigar entry of the resulting read.
    """
    full_query = (
        'ATCTAATAACTTGATCAATA'
        'TCTGTGATTATATTTTCATT'
        'GCCTTCCAATTTT'
    )
    row = {
        'tstarts': [950],
        'block_sizes': [53],
        'qname': 'seq1',
        'tname': 'fake',
        'qstarts': [0],
        'strand': '+',
        'score': 0,
        'qseq_full': full_query,
    }

    aligned = Blat.pslx_row_to_pysam(row, cache, REFERENCE_GENOME)

    assert aligned.reference_id == 0
    assert query_coverage_interval(aligned) == Interval(0, 52)
    assert aligned.reference_start == 950
    assert aligned.reference_end == 1003
    assert aligned.cigar == [(CIGAR.EQ, 53)]
def test_pslx_row_to_pysam_revcomp_deletion(self):
    """Convert a two-block '-' strand row against REFERENCE_GENOME.

    Checks the reference id, query coverage, reference start, cigar and the
    two halves of the resulting query sequence.
    """
    # the same two block sequences are used for both the query and the target
    first_block = 'TAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCA'
    second_block = 'CCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG'
    row = {
        'block_count': 2,
        'tstarts': [2205, 2281],
        'block_sizes': [50, 34],
        'qname': 'seq1',
        'tname': 'reference3',
        'qstarts': [0, 50],
        'strand': '-',
        'qseq_full': 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTA',
        'score': 1,
        'qseqs': [first_block, second_block],
        'tseqs': [first_block, second_block],
    }

    aligned = Blat.pslx_row_to_pysam(row, self.cache, REFERENCE_GENOME)

    self.assertEqual(3, aligned.reference_id)
    self.assertEqual(Interval(0, 83), query_coverage_interval(aligned))
    self.assertEqual(2205, aligned.reference_start)
    expected_cigar = [(CIGAR.EQ, 51), (CIGAR.D, 26), (CIGAR.EQ, 33)]
    self.assertEqual(expected_cigar, aligned.cigar)
    self.assertEqual(first_block, aligned.query_sequence[0:50])
    self.assertEqual(second_block, aligned.query_sequence[50:])
def test_pslx_row_to_pysam_inversion(self):
    """Convert a '+' and a '-' strand row that share one full query sequence.

    Each read is checked for reference id, reference start, cigar and query
    coverage; finally the two query sequences must be reverse complements of
    each other.
    """
    full_query = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

    # first part of the inversion
    forward_row = {
        'block_count': 1,
        'tstarts': [1114],
        'block_sizes': [120],
        'qname': 'seq1',
        'tname': 'reference3',
        'qstarts': [125],
        'strand': '+',
        'qseq_full': full_query,
        'score': 1,
        'qseqs': [
            'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGG'
            'TTTTCATTTCTGTATGTTAAT'
        ],
        'tseqs': [
            'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGG'
            'TTTTCATTTCTGTATGTTAAT'
        ],
    }
    forward_read = Blat.pslx_row_to_pysam(forward_row, self.cache, REFERENCE_GENOME)
    self.assertEqual(3, forward_read.reference_id)
    self.assertEqual(Interval(125, 244), query_coverage_interval(forward_read))
    self.assertEqual(1114, forward_read.reference_start)
    self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], forward_read.cigar)

    # second part of the inversion
    reverse_row = {
        'block_count': 1,
        'tstarts': [2187],
        'block_sizes': [128],
        'qname': 'seq1',
        'tname': 'reference3',
        'qstarts': [117],
        'strand': '-',
        'qseq_full': full_query,
        'score': 1,
        'qseqs': [
            'TGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAAT'
            'TCTGTGTTTACAGGGCTTTCATGCTCAG'
        ],
        'tseqs': [
            'TGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAAT'
            'TCTGTGTTTACAGGGCTTTCATGCTCAG'
        ],
    }
    reverse_read = Blat.pslx_row_to_pysam(reverse_row, self.cache, REFERENCE_GENOME)
    self.assertEqual(3, reverse_read.reference_id)
    self.assertEqual(2187, reverse_read.reference_start)
    self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], reverse_read.cigar)
    self.assertEqual(Interval(117, 244), query_coverage_interval(reverse_read))

    # the two reads must carry the same sequence on opposite strands
    self.assertEqual(
        forward_read.query_sequence,
        reverse_complement(reverse_read.query_sequence),
    )