def test_alignment_to_pairs_001(self):
    """Verify the (position, (base, run length)) pairs built from a read.

    A single alignment is used to exercise several cases at once:

    - an insertion (expected with a reference position of `None`),
    - a deletion (expected with the base `*`),
    - run lengths larger than `self.max_run`, which are expected to be
      capped to the max_run (the expected values below assume a cap of 3).
    """
    # Run lengths are carried in the per-base quality values.
    read_scores = array.array('B', [2, 1, 4, 5, 1, 1, 2, 16, 2, 3, 4])
    aln = common.initialise_alignment(
        'query',         # query name
        1,               # reference id
        10,              # reference start
        'ACATGATGTAC',   # query sequence
        '3=1I2=1D5=',    # cigar string
        0,               # flag
        query_qualities=read_scores)

    # Column-wise description of the expected output: one entry per pair.
    positions = (10, 11, 12, None, 13, 14, 15, 16, 17, 18, 19, 20)
    bases = 'ACATGA*TGTAC'
    runs = (2, 1, 3, 3, 1, 1, 1, 2, 3, 2, 3, 3)
    expected = tuple(zip(positions, zip(bases, runs)))

    got = tuple(self.ls._alignment_to_pairs(aln))
    self.assertEqual(got, expected)
def create_simple_bam(fname, calls):
    """Create a small, sorted and indexed bam file.

    RLE encoding is coded in the qscores of the records. Every record is
    placed on the single reference `ref` at reference position 0.

    :param fname: path of the sorted bam file to create; it is indexed
        with `pysam.index` once written.
    :param calls: iterable of dicts, each providing the keys `query_name`,
        `seq`, `cigarstring`, `flag`, `quality` and `tags`.
    """
    ref_len = len(simple_data['ref'])
    header = {'HD': {'VN': '1.0'}, 'SQ': [{'LN': ref_len, 'SN': 'ref'}]}

    # Records are first written to a scratch file, which is then sorted
    # into `fname`.
    tmp_file = '{}.tmp'.format(fname)
    try:
        with pysam.AlignmentFile(
                tmp_file, 'wb', reference_names=['ref'],
                reference_lengths=[ref_len], header=header) as output:
            for basecall in calls:
                record = common.initialise_alignment(
                    query_name=basecall['query_name'],
                    reference_id=0,
                    reference_start=0,
                    query_sequence=basecall['seq'],
                    cigarstring=basecall['cigarstring'],
                    flag=basecall['flag'],
                    query_qualities=basecall['quality'],
                    tags=basecall['tags'])
                output.write(record)
        pysam.sort("-o", fname, tmp_file)
    finally:
        # Do not leave the scratch file behind, even when writing or
        # sorting fails.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    pysam.index(fname)
def test_derived(self):
    """Check alignment attributes that are derived from the inputs."""
    aln = common.initialise_alignment(**self.input_kwargs)
    # (attribute name, expected value) pairs, checked in this order.
    derived = (
        ('query_alignment_start', 1),
        ('query_alignment_end', 12),
        ('query_alignment_sequence', 'CCCTGTTGATC'),
    )
    for attribute, value in derived:
        self.assertEqual(getattr(aln, attribute), value)
def setUpClass(cls):
    """Create temporary file names, a reference fasta and an indexed bam.

    The bam holds two reads; read2 carries a single-base insertion:

        Ref     T T A A   C T T T G
        Read1       A A   C T T T G
        Read2     T A A A C T T T G
    """
    cls.bam_input = tempfile.NamedTemporaryFile(suffix='.bam').name
    cls.bam_output = tempfile.NamedTemporaryFile(suffix='.bam').name
    cls.ref_fname = tempfile.NamedTemporaryFile(suffix='.fasta').name

    with open(cls.ref_fname, 'w') as fasta:
        fasta.write('>ref\nTTAACTTTG\n')

    # Fields that are identical for both reads.
    shared = {'reference_id': 0, 'flag': 0, 'mapping_quality': 50}
    reads = (
        dict(
            shared, query_name='read1', reference_start=2,
            query_sequence='AACTTTG', cigarstring='7='),
        dict(
            shared, query_name='read2', reference_start=1,
            query_sequence='TAAACTTTG', cigarstring='3=1I5='),
    )

    bam_header = {
        'HD': {'VN': '1.0'},
        'SQ': [{'LN': 9, 'SN': 'ref'}],
    }
    # Write unsorted records to a scratch file, then sort into place.
    unsorted_bam = '{}.tmp'.format(cls.bam_input)
    with pysam.AlignmentFile(unsorted_bam, 'wb', header=bam_header) as bam:
        for read_kwargs in reads:
            bam.write(common.initialise_alignment(**read_kwargs))

    pysam.sort("-o", cls.bam_input, unsorted_bam)
    os.remove(unsorted_bam)
    pysam.index(cls.bam_input)
def test_compression(self):
    """Compress an alignment and check the resulting attributes.

    Expected effect of the compression:

        ref:   TACCCATGTTGATCG  -->  TACATGTGATCG
        seq:   gCCCA*GTTGATCtt  -->  gCA*GTGATCt
        cigar: 1S4=1D7=2S       -->  1S2=1D6=1S
    """
    original = common.initialise_alignment(**self.alignment_kwargs)
    rle_reference = medaka.rle.RLEConverter(self.ref)
    compressed = medaka.rle._compress_alignment(original, rle_reference)

    # (attribute name, expected value) pairs, checked in this order.
    checks = (
        ('cigarstring', '1S2=1D6=1S'),
        ('query_sequence', 'GCAGTGATCT'),
        ('query_alignment_start', 1),
        ('query_alignment_end', 9),
        ('query_alignment_sequence', 'CAGTGATC'),
        ('reference_start', 2),
        ('reference_end', 11),
    )
    for attribute, value in checks:
        self.assertEqual(getattr(compressed, attribute), value)
def test_inputs(self):
    """Check each input keyword can be read back from the alignment."""
    aln = common.initialise_alignment(**self.input_kwargs)
    for name, given in self.input_kwargs.items():
        # The attribute of the same name must equal the value passed in.
        self.assertEqual(given, getattr(aln, name))