def test_with_skipped_region_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig whose alignment skips two bases.

    The contig 'ACGGGCAAA' is aligned as 4M2N2M3S with ctg_clv=5 and
    ref_clv=14.  The expected sequence ends at the cleavage site, leaves out
    the soft-clipped 'AAA' tail and contains the two skipped bases as
    returned by ``ref_fa.fetch``.
    """
    contig = MagicMock(
        reference_name='chr1',
        query_sequence='ACGGGCAAA',
        cigartuples=(
            (S.BAM_CMATCH, 4),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 2),
            (S.BAM_CSOFT_CLIP, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch = MagicMock(return_value='TT')

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': reference,
        'ctg_clv': 5,
    }

    assert extract_seq(**common) == 'ACGGTTGC'
    assert extract_seq(window=3, **common) == 'TGC'
    # Only the most recent fetch call is checked here.
    reference.fetch.assert_called_with('chr1', 11, 13)
def test_hardclip_minus_strand(mock_apautils):
    """Minus-strand clv on a suffix contig that ends with a hard clip.

    The alignment is 2S4M3H and the full sequence 'TTCGCACCG' is supplied
    through ``mock_apautils.infer_query_sequence``.  With ctg_clv=2 the
    expected sequence starts right after the soft-clipped 'TT' and keeps the
    three bases covered by the hard clip.

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'TTCGCACCG'

    contig = MagicMock(
        reference_name='chr2',
        cigartuples=(
            (S.BAM_CSOFT_CLIP, 2),
            (S.BAM_CMATCH, 4),
            (S.BAM_CHARD_CLIP, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    common = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 5,
        'ref_fa': reference,
        'ctg_clv': 2,
    }

    assert extract_seq(**common) == 'CGCACCG'
    for window, expected in ((1, 'C'), (3, 'CGC')):
        assert extract_seq(window=window, **common) == expected
def test_with_hardclip_after_clv(mock_apautils):
    """Minus-strand clv on a bridge contig whose last two bases are hard clipped.

    The alignment is 6M2H and the inferred full sequence is 'ATTCGTGA'.
    With ctg_clv=4 only four bases remain from the cleavage site onwards, so
    both the default call and ``window=5`` are expected to return 'GTGA'.

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'ATTCGTGA'

    contig = MagicMock(
        reference_name='chr2',
        cigartuples=(
            (S.BAM_CMATCH, 6),
            (S.BAM_CHARD_CLIP, 2),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    common = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 9,
        'ref_fa': reference,
        'ctg_clv': 4,
    }

    assert extract_seq(**common) == 'GTGA'
    # A window wider than the available sequence yields the same four bases.
    assert extract_seq(window=5, **common) == 'GTGA'
def test_with_skipped_region_and_insertions_mismatches_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig with an insertion and a skip.

    The contig 'AGTAGCGAAA' is aligned as 1M1I2M2N3M3S with ctg_clv=6 and
    ref_clv=14.  The expected sequence keeps the inserted 'G', takes the two
    skipped bases from ``ref_fa.fetch`` ('CC') and leaves out the
    soft-clipped tail.
    """
    contig = MagicMock(
        reference_name='chr3',
        query_sequence='AGTAGCGAAA',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CINS, 1),
            (S.BAM_CMATCH, 2),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 3),
            (S.BAM_CSOFT_CLIP, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch.return_value = 'CC'

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': reference,
        'ctg_clv': 6,
    }

    assert extract_seq(**common) == 'AGTACCGCG'
    # Checked before the windowed calls, which may fetch again.
    reference.fetch.assert_called_once_with('chr3', 10, 12)

    windowed = ((1, 'G'), (3, 'GCG'), (8, 'GTACCGCG'))
    for window, expected in windowed:
        assert extract_seq(window=window, **common) == expected
def test_with_skipped_and_deleted_regions_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig with a deletion and a skip.

    The contig 'ATTGCAAA' is aligned as 1M1D2M2N2M3S with ctg_clv=4 and
    ref_clv=14.  The expected sequence 'ATTAAGC' contains the skipped bases
    returned by ``ref_fa.fetch`` ('AA') but nothing for the deleted base.
    """
    contig = MagicMock(
        reference_name='chr3',
        query_sequence='ATTGCAAA',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CDEL, 1),
            (S.BAM_CMATCH, 2),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 2),
            (S.BAM_CSOFT_CLIP, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch.return_value = 'AA'

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': reference,
        'ctg_clv': 4,
    }

    assert extract_seq(**common) == 'ATTAAGC'
    # Exactly one fetch so far: the skip, not the deletion.
    reference.fetch.assert_called_once_with('chr3', 11, 13)
    assert extract_seq(window=5, **common) == 'TAAGC'
def test_with_5_base_inserted_region_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig carrying a five-base insertion.

    The contig 'ACGGAATCCGCGAA' is aligned as 4M5I3M2S with ctg_clv=11 and
    ref_clv=13.  The expected sequence keeps the inserted 'AATCC' and leaves
    out the soft-clipped 'AA' tail.
    """
    contig = MagicMock(
        reference_name='chr1',
        query_sequence='ACGGAATCCGCGAA',
        cigartuples=(
            (S.BAM_CMATCH, 4),
            (S.BAM_CINS, 5),
            (S.BAM_CMATCH, 3),
            (S.BAM_CSOFT_CLIP, 2),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 13,
        'ref_fa': reference,
        'ctg_clv': 11,
    }

    assert extract_seq(**common) == 'ACGGAATCCGCG'
    assert extract_seq(window=5, **common) == 'CCGCG'
def test_hardclip_plust_strand(mock_apautils):
    """Plus-strand clv on a suffix contig that starts with a hard clip.

    The alignment is 3H4M3S and the full sequence 'CGCACCGAAA' is supplied
    through ``mock_apautils.infer_query_sequence``.  With ctg_clv=6 the
    expected sequence keeps the three hard-clipped bases at the start and
    leaves out the soft-clipped 'AAA' tail.

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'CGCACCGAAA'

    contig = MagicMock(
        reference_name='chr2',
        cigartuples=(
            (S.BAM_CHARD_CLIP, 3),
            (S.BAM_CMATCH, 4),
            (S.BAM_CSOFT_CLIP, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 8,
        'ref_fa': reference,
        'ctg_clv': 6,
    }

    assert extract_seq(**common) == 'CGCACCG'
    for window, expected in ((1, 'G'), (3, 'CCG')):
        assert extract_seq(window=window, **common) == expected
def test_with_indel_and_skipped_regions_and_mismatches_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig mixing skip, insertion and deletion.

    The contig 'ACCTCGTAGCAA' is aligned as 1M3N2M2I3M2D2M2S with ctg_clv=9
    and ref_clv=19.  The expected sequence fills the three skipped bases
    from ``ref_fa.fetch`` ('CTG'), keeps the inserted 'TC', adds nothing for
    the deletion and leaves out the soft-clipped tail.
    """
    contig = MagicMock(
        reference_name='chr3',
        query_sequence='ACCTCGTAGCAA',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 3),
            (S.BAM_CMATCH, 2),
            (S.BAM_CINS, 2),
            (S.BAM_CMATCH, 3),
            (S.BAM_CDEL, 2),
            (S.BAM_CMATCH, 2),
            (S.BAM_CSOFT_CLIP, 2),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch.return_value = 'CTG'

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 19,
        'ref_fa': reference,
        'ctg_clv': 9,
    }

    assert extract_seq(**common) == 'ACTGCCTCGTAGC'
    # Checked before the windowed call, which may fetch again.
    reference.fetch.assert_called_once_with('chr3', 8, 11)
    assert extract_seq(window=10, **common) == 'GCCTCGTAGC'
def test_extract_seq_for_minus_strand_clv_supported_by_suffix():
    """Minus-strand clv supported by a suffix contig with a leading soft clip.

    The contig 'TTTACATCG' is aligned as 3S6M with ctg_clv=3 and ref_clv=8.
    The expected sequence starts at the cleavage site, right after the
    soft-clipped 'TTT', and ``window`` limits how many bases are returned.
    """
    reference = MagicMock()
    contig = MagicMock(
        query_sequence='TTTACATCG',
        cigartuples=(
            (S.BAM_CSOFT_CLIP, 3),
            (S.BAM_CMATCH, 6),
        ),
    )

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (contig, '-', 8, reference, 3)

    assert extract_seq(*positional) == 'ACATCG'
    windowed = ((1, 'A'), (2, 'AC'), (3, 'ACA'), (4, 'ACAT'))
    for window, expected in windowed:
        assert extract_seq(*positional, window=window) == expected
def test_with_2_base_insertion():
    """Minus-strand clv on a suffix contig carrying a two-base insertion.

    The contig 'TTTACGATCG' is aligned as 3S2M2I3M with ctg_clv=3 and
    ref_clv=7.  The expected sequence starts after the soft-clipped 'TTT'
    and keeps the inserted 'GA'.
    """
    contig = MagicMock(
        reference_name='chr1',
        query_sequence='TTTACGATCG',
        cigartuples=(
            (S.BAM_CSOFT_CLIP, 3),
            (S.BAM_CMATCH, 2),
            (S.BAM_CINS, 2),
            (S.BAM_CMATCH, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    common = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 7,
        'ref_fa': reference,
        'ctg_clv': 3,
    }

    assert extract_seq(**common) == 'ACGATCG'
    for window, expected in ((2, 'AC'), (3, 'ACG')):
        assert extract_seq(window=window, **common) == expected
def test_with_skipped_region():
    """Minus-strand clv on a suffix contig whose alignment skips two bases.

    The contig 'TTTGTC' is aligned as 3S2M2N1M with ctg_clv=3 and ref_clv=8.
    The expected sequence starts after the soft-clipped 'TTT' and contains
    the two skipped bases as returned by ``ref_fa.fetch`` ('TG').
    """
    contig = MagicMock(
        reference_name='chr2',
        query_sequence='TTTGTC',
        cigartuples=(
            (S.BAM_CSOFT_CLIP, 3),
            (S.BAM_CMATCH, 2),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 1),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch = MagicMock(return_value='TG')

    common = {
        'contig': contig,
        'strand': '-',
        'ref_clv': 8,
        'ref_fa': reference,
        'ctg_clv': 3,
    }

    assert extract_seq(**common) == 'GTTGC'
    reference.fetch.assert_called_with('chr2', 10, 12)

    # **common has to come after window= to stay valid on py34.
    for window, expected in ((1, 'G'), (3, 'GTT'), (4, 'GTTG')):
        assert extract_seq(window=window, **common) == expected
def test_extract_seq_for_bridge_with_deletion():
    """Plus-strand clv on a bridge contig with a two-base deletion.

    The contig 'GACTCGTC' is aligned as 3M2D5M with ctg_clv=5 and
    ref_clv=15.  The expected sequence is the contig up to and including the
    cleavage site, with nothing added for the deleted bases.
    """
    contig = MagicMock(
        reference_name='chr2',
        query_sequence='GACTCGTC',
        cigartuples=(
            (S.BAM_CMATCH, 3),
            (S.BAM_CDEL, 2),
            (S.BAM_CMATCH, 5),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    result = extract_seq(
        contig=contig, strand='+', ref_clv=15, ref_fa=reference, ctg_clv=5)
    assert result == 'GACTCG'
def test_extract_seq_for_bridge_with_insertion():
    """Plus-strand clv on a bridge contig with a two-base insertion.

    The contig 'GAAGCGGTCGC' is aligned as 2M2I7M with ctg_clv=7 and
    ref_clv=10.  The expected sequence is the contig up to and including the
    cleavage site, inserted 'AG' included.
    """
    contig = MagicMock(
        reference_name='chr2',
        query_sequence='GAAGCGGTCGC',
        cigartuples=(
            (S.BAM_CMATCH, 2),
            (S.BAM_CINS, 2),
            (S.BAM_CMATCH, 7),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    result = extract_seq(
        contig=contig, strand='+', ref_clv=10, ref_fa=reference, ctg_clv=7)
    assert result == 'GAAGCGGT'
def test_extract_seq_with_skip_before_and_after_ctg_clv():
    """Plus-strand clv on a bridge contig with skips on both sides of the clv.

    The contig 'GAGTGC' is aligned as 1M2N3M1N2M with ctg_clv=3 and
    ref_clv=10.  Only one fetch is expected, for ('chr2', 6, 8), and its
    result ('AC') appears in the expected sequence 'GACAGT'.
    """
    contig = MagicMock(
        reference_name='chr2',
        query_sequence='GAGTGC',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 3),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 2),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch.return_value = 'AC'

    result = extract_seq(
        contig=contig, strand='+', ref_clv=10, ref_fa=reference, ctg_clv=3)
    assert result == 'GACAGT'
    # A single fetch: the second skip in the CIGAR must not be looked up.
    reference.fetch.assert_called_once_with('chr2', 6, 8)
def test_hardclip_spanning_clv_from_after_edgecase_3(mock_apautils):
    """Plus-strand clv that lies inside a trailing hard clip.

    The alignment is 1M1N2M5H and the inferred full sequence is 'AGGTTGCA',
    so only contig indices 0-2 are aligned and ctg_clv=4 falls in the
    hard-clipped part.  The expected sequence 'ACGGTT' still runs up to the
    cleavage site, with the skipped base taken from ``ref_fa.fetch`` ('C').

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'AGGTTGCA'

    contig = MagicMock(
        reference_name='chr1',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 2),
            (S.BAM_CHARD_CLIP, 5),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch = MagicMock(return_value='C')

    result = extract_seq(
        contig=contig, strand='+', ref_clv=12, ref_fa=reference, ctg_clv=4)
    assert result == 'ACGGTT'
def test_extract_seq_for_bridge_with_multiple_skips_before_clv():
    """Plus-strand clv on a bridge contig with two skips before the clv.

    The contig 'GCCTAGC' is aligned as 1M1N1M2N5M with ctg_clv=4 and
    ref_clv=12.  Both skipped stretches are expected to be filled from
    ``ref_fa.fetch``, giving 'GACTGCTA'.
    """
    contig = MagicMock(
        reference_name='chr2',
        query_sequence='GCCTAGC',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 5),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    # Fetch results are consumed from right to left along the contig, so the
    # two-base skip ('TG') is served before the one-base skip ('A').
    reference.fetch.side_effect = ['TG', 'A']

    result = extract_seq(
        contig=contig, strand='+', ref_clv=12, ref_fa=reference, ctg_clv=4)
    assert result == 'GACTGCTA'
def test_with_two_skipped_regions_and_a_mismatch():
    """Minus-strand clv on a suffix contig with two skipped regions.

    The contig 'TTTGTCAGAC' is aligned as 3S2M2N3M1N2M with ctg_clv=3 and
    ref_clv=8.  The two skips are expected to be fetched in left-to-right
    order, ('chr2', 10, 12) then ('chr2', 15, 16), and spliced into the
    result 'GTTGCAGCAC'.
    """
    contig = MagicMock(
        reference_name='chr2',
        query_sequence='TTTGTCAGAC',
        cigartuples=(
            (S.BAM_CSOFT_CLIP, 3),
            (S.BAM_CMATCH, 2),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 3),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 2),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch.side_effect = ['TG', 'C']

    result = extract_seq(
        contig=contig, strand='-', ref_clv=8, ref_fa=reference, ctg_clv=3)
    assert result == 'GTTGCAGCAC'

    expected_calls = [call('chr2', 10, 12), call('chr2', 15, 16)]
    assert reference.fetch.call_count == 2
    reference.fetch.assert_has_calls(expected_calls)
def test_with_hardclip_spanning_clv_from_before_edgecase_3(mock_apautils):
    """Minus-strand clv that lies inside a leading hard clip.

    The alignment is 1H3M and the inferred full sequence is 'GTGA'.  With
    ctg_clv=0 and ref_clv=0 the cleavage site is the hard-clipped first
    base, and the whole inferred sequence is expected back.

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'GTGA'

    contig = MagicMock(
        reference_name='chr2',
        cigartuples=(
            (S.BAM_CHARD_CLIP, 1),
            (S.BAM_CMATCH, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    result = extract_seq(
        contig=contig, strand='-', ref_clv=0, ref_fa=reference, ctg_clv=0)
    assert result == 'GTGA'
def test_with_three_skipped_region_and_mismatches_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig with three skipped regions.

    The contig 'ACCGTAGCAA' is aligned as 1M3N2M1N3M2N2M2S with ctg_clv=7
    and ref_clv=20.  The full extraction is expected to fetch all three
    skips, rightmost first; a nine-base window is expected to need only two
    fetches.
    """
    contig = MagicMock(
        reference_name='chr3',
        query_sequence='ACCGTAGCAA',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 3),
            (S.BAM_CMATCH, 2),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 3),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 2),
            (S.BAM_CSOFT_CLIP, 2),
        ),
    )

    full_ref = MagicMock()
    full_ref.get_reference_length.return_value = 100
    full_ref.fetch.side_effect = ['TT', 'A', 'CTG']

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 20,
        'ref_fa': full_ref,
        'ctg_clv': 7,
    }

    assert extract_seq(**common) == 'ACTGCCAGTATTGC'
    assert full_ref.fetch.call_count == 3
    full_ref.fetch.assert_has_calls([
        call('chr3', 17, 19),
        call('chr3', 13, 14),
        call('chr3', 8, 11),
    ])

    # Fresh reference mock so the call count starts from zero again.
    windowed_ref = MagicMock()
    windowed_ref.get_reference_length.return_value = 100
    windowed_ref.fetch.side_effect = ['TT', 'A']
    common['ref_fa'] = windowed_ref

    assert extract_seq(window=9, **common) == 'CAGTATTGC'
    assert windowed_ref.fetch.call_count == 2
def test_extract_seq_for_minus_strand_clv_supported_by_link():
    """Minus-strand clv supported by a fully matched link contig.

    The contig 'ACATCG' is aligned as 6M with ctg_clv=0 and ref_clv=8.  The
    whole contig is expected by default, and ``window`` limits the result to
    that many leading bases.
    """
    reference = MagicMock()
    contig = MagicMock(
        query_sequence='ACATCG',
        cigartuples=((S.BAM_CMATCH, 6), ),
    )

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (contig, '-', 8, reference, 0)

    assert extract_seq(*positional) == 'ACATCG'
    windowed = ((1, 'A'), (2, 'AC'), (3, 'ACA'), (4, 'ACAT'))
    for window, expected in windowed:
        assert extract_seq(*positional, window=window) == expected
def test_hardclip_before_clv(mock_apautils):
    """Plus-strand clv on a bridge contig that starts with a hard clip.

    The alignment is 3H8M and the inferred full sequence is 'CGCATTCGTCG'
    (a hard-clipped contig could be chimeric, see
    https://www.biostars.org/p/109333/).  With ctg_clv=6 the expected
    sequence runs from the hard-clipped start up to the cleavage site, and
    ``window`` limits the result to that many trailing bases.

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'CGCATTCGTCG'

    contig = MagicMock(
        reference_name='chr2',
        cigartuples=(
            (S.BAM_CHARD_CLIP, 3),
            (S.BAM_CMATCH, 8),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 8,
        'ref_fa': reference,
        'ctg_clv': 6,
    }

    assert extract_seq(**common) == 'CGCATTC'
    windowed = ((1, 'C'), (2, 'TC'), (3, 'TTC'), (4, 'ATTC'), (5, 'CATTC'))
    for window, expected in windowed:
        assert extract_seq(window=window, **common) == expected
def test_hardclip_after_clv(mock_apautils):
    """Plus-strand clv on a bridge contig with a skip and a trailing hard clip.

    The alignment is 1M1N6M3H and the inferred full sequence is
    'AGGTTGCAGA'.  With ctg_clv=4 and ref_clv=12 the expected sequence ends
    at the cleavage site and contains the skipped base from
    ``ref_fa.fetch`` ('C').

    ``mock_apautils`` is presumably injected by a patch decorator or fixture
    that is not visible in this chunk.
    """
    mock_apautils.infer_query_sequence.return_value = 'AGGTTGCAGA'

    contig = MagicMock(
        reference_name='chr1',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 6),
            (S.BAM_CHARD_CLIP, 3),
        ),
    )

    reference = MagicMock()
    reference.get_reference_length.return_value = 100
    reference.fetch = MagicMock(return_value='C')

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 12,
        'ref_fa': reference,
        'ctg_clv': 4,
    }

    assert extract_seq(**common) == 'ACGGTT'
    reference.fetch.assert_called_with('chr1', 8, 9)

    windowed = ((1, 'T'), (2, 'TT'), (3, 'GTT'), (4, 'GGTT'), (5, 'CGGTT'))
    for window, expected in windowed:
        assert extract_seq(window=window, **common) == expected
def test_extract_seq_for_plus_strand_clv_supported_by_link():
    """Plus-strand clv supported by a fully matched link contig.

    The contig 'ATCGAC' is aligned as 6M with ctg_clv=5 and ref_clv=12.  The
    whole contig is expected by default, and ``window`` limits the result to
    that many trailing bases.
    """
    reference = MagicMock()
    contig = MagicMock(
        query_sequence='ATCGAC',
        cigartuples=((S.BAM_CMATCH, 6), ),
    )

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (contig, '+', 12, reference, 5)

    assert extract_seq(*positional) == 'ATCGAC'
    windowed = ((1, 'C'), (2, 'AC'), (3, 'GAC'), (4, 'CGAC'))
    for window, expected in windowed:
        assert extract_seq(*positional, window=window) == expected
def test_with_two_skipped_region_for_plus_strand_clv():
    """Plus-strand clv on a suffix contig with two skipped regions.

    The contig 'ATTGCAAA' is aligned as 1M1N2M2N2M3S with ctg_clv=4 and
    ref_clv=14.  The full extraction is expected to fetch both skips,
    rightmost first; a three-base window is expected to need one fetch only.
    """
    contig = MagicMock(
        reference_name='chr3',
        query_sequence='ATTGCAAA',
        cigartuples=(
            (S.BAM_CMATCH, 1),
            (S.BAM_CREF_SKIP, 1),
            (S.BAM_CMATCH, 2),
            (S.BAM_CREF_SKIP, 2),
            (S.BAM_CMATCH, 2),
            (S.BAM_CSOFT_CLIP, 3),
        ),
    )

    full_ref = MagicMock()
    full_ref.get_reference_length.return_value = 100
    full_ref.fetch.side_effect = ['AA', 'C']

    common = {
        'contig': contig,
        'strand': '+',
        'ref_clv': 14,
        'ref_fa': full_ref,
        'ctg_clv': 4,
    }

    assert extract_seq(**common) == 'ACTTAAGC'
    assert full_ref.fetch.call_count == 2
    full_ref.fetch.assert_has_calls(
        [call('chr3', 11, 13), call('chr3', 8, 9)])

    # use a new mock, couldn't make ref_fa.reset_mock() work
    windowed_ref = MagicMock()
    windowed_ref.get_reference_length.return_value = 100
    windowed_ref.fetch.return_value = 'AA'
    common['ref_fa'] = windowed_ref

    assert extract_seq(window=3, **common) == 'AGC'
    assert windowed_ref.fetch.call_count == 1
def test_extract_seq_for_plus_strand_clv_supported_by_suffix():
    """Plus-strand clv supported by a suffix contig with a trailing soft clip.

    The contig 'ATCGACAA' is aligned as 6M2S with ctg_clv=5 and ref_clv=12.
    The expected sequence ends at the cleavage site, leaves out the
    soft-clipped 'AA', and ``window`` limits it to that many trailing bases.
    """
    reference = MagicMock()
    contig = MagicMock(
        query_sequence='ATCGACAA',
        cigartuples=(
            (S.BAM_CMATCH, 6),
            (S.BAM_CSOFT_CLIP, 2),
        ),
    )

    # Positional order: contig, strand, ref_clv, ref_fa, ctg_clv.
    positional = (contig, '+', 12, reference, 5)

    assert extract_seq(*positional) == 'ATCGAC'
    windowed = ((1, 'C'), (2, 'AC'), (3, 'GAC'), (4, 'CGAC'))
    for window, expected in windowed:
        assert extract_seq(*positional, window=window) == expected
def gen_hex_tuple(contig, strand, ref_clv, ref_fa, ctg_clv, dd_bridge):
    """Search for a hexamer at a cleavage site unless one is already recorded.

    The cleavage site is keyed by
    ``apautils.gen_clv_key_tuple_with_ctg_clv(seqname, strand, ref_clv,
    ctg_clv)``, where ``seqname`` is ``contig.reference_name``.

    Returns:
        ``None`` when ``dd_bridge['hexamer_tuple']`` already holds a
        non-None entry for the key (no search is done).  Otherwise the
        result of ``search`` on the sequence produced by ``extract_seq``,
        or ``('NA', -1, -1)`` when ``search`` returns ``None``.
    """
    # TODO: the returning of None is pretty ugly, to refactor
    clv_key = apautils.gen_clv_key_tuple_with_ctg_clv(
        contig.reference_name, strand, ref_clv, ctg_clv)

    # An entry that is already populated means there is nothing left to do.
    if dd_bridge['hexamer_tuple'][clv_key] is not None:
        return None

    source_seq = extract_seq(contig, strand, ref_clv, ref_fa, ctg_clv)
    found = search(strand, ref_clv, source_seq)
    return ('NA', -1, -1) if found is None else found
def test_extract_seq_with_ending_softclip(self):
    """Plus-strand clv on a contig that ends with a two-base soft clip.

    The contig 'GGGAA' is aligned as 3M2S with ctg_clv=2 and ref_clv=7; the
    soft-clipped 'AA' is expected to be left out of the result.
    """
    contig = MagicMock(
        query_sequence='GGGAA',
        cigartuples=(
            (S.BAM_CMATCH, 3),
            (S.BAM_CSOFT_CLIP, 2),
        ),
    )

    result = extract_seq(
        contig, strand='+', ref_clv=7, ref_fa=MagicMock(), ctg_clv=2)
    assert result == 'GGG'
def test_extract_seq_with_ending_softclip(self, mock_apautils):
    """Plus-strand clv on a contig that ends with a two-base hard clip.

    The alignment is 3M2H and the full sequence 'GGGAA' is supplied through
    ``mock_apautils.infer_query_sequence``.  With ctg_clv=2 and ref_clv=7
    the expected result is 'GGG'.

    NOTE(review): the name says "softclip" although the CIGAR uses a hard
    clip, and the same method name is defined once more in this file --
    confirm the two definitions live in different classes.
    """
    mock_apautils.infer_query_sequence.return_value = 'GGGAA'

    contig = MagicMock(
        cigartuples=(
            (S.BAM_CMATCH, 3),
            (S.BAM_CHARD_CLIP, 2),
        ),
    )

    result = extract_seq(
        contig, strand='+', ref_clv=7, ref_fa=MagicMock(), ctg_clv=2)
    assert result == 'GGG'
def test_extract_seq_with_starting_softclip(self):
    """Minus-strand clv on a contig that starts with a two-base soft clip.

    The contig 'TTCCA' is given the CIGAR 2S4M with ctg_clv=2 and ref_clv=8;
    the soft-clipped 'TT' is expected to be left out of the result.

    NOTE(review): 2S4M consumes six query bases while the sequence has only
    five -- confirm whether the match length was meant to be 3.
    """
    contig = MagicMock(
        query_sequence='TTCCA',
        cigartuples=(
            (S.BAM_CSOFT_CLIP, 2),
            (S.BAM_CMATCH, 4),
        ),
    )

    result = extract_seq(
        contig, '-', ref_clv=8, ref_fa=MagicMock(), ctg_clv=2)
    assert result == 'CCA'
def test_extract_seq_with_starting_softclip(self, mock_apautils):
    """Minus-strand clv on a contig that starts with a two-base hard clip.

    The CIGAR is 2H4M and the full sequence 'TTCCA' is supplied through
    ``mock_apautils.infer_query_sequence``.  With ctg_clv=2 and ref_clv=8
    the expected result is 'CCA'.

    NOTE(review): 2H4M accounts for six bases while the inferred sequence
    has only five -- confirm whether the match length was meant to be 3.
    NOTE(review): the name says "softclip" although the CIGAR uses a hard
    clip, and the same method name is defined once more in this file --
    confirm the two definitions live in different classes.
    """
    mock_apautils.infer_query_sequence.return_value = 'TTCCA'

    contig = MagicMock(
        cigartuples=(
            (S.BAM_CHARD_CLIP, 2),
            (S.BAM_CMATCH, 4),
        ),
    )

    result = extract_seq(
        contig, '-', ref_clv=8, ref_fa=MagicMock(), ctg_clv=2)
    assert result == 'CCA'