def test_for_skip_check_sizes():
    """
    A left-tailed clv sitting on the last contig base before a skip
    (CIGAR: 3M 2N 2M).

    Without `skip_check_size` the expected genome offset is 2; with
    `skip_check_size=1` the expected offset is 5, i.e. the first genome
    position after the skip.

    GTA--AT    <-bridge contig
    012  34    <-contig coord
      ^ctg_clv
    0123456    <-genome offset coord
      ^gnm_offset (default)
         ^gnm_offset (skip_check_size=1)
    """
    op_match, op_skip = S.BAM_CMATCH, S.BAM_CREF_SKIP
    cigar = [
        (op_match, 3),
        (op_skip, 2),
        (op_match, 2),
    ]

    with_skip_check = calc_genome_offset(
        cigar, ctg_clv=2, tail_side='left', skip_check_size=1)
    assert with_skip_check == 5

    without_skip_check = calc_genome_offset(
        cigar, ctg_clv=2, tail_side='left')
    assert without_skip_check == 2
def test_for_a_case_derived_from_a_real_one_E1_L_4362_chr16_plus_strand():
    """
    Case derived from a real contig: mostly just the bases around the clv
    are copied, the coordinates are arbitrary.

    CIGAR: 3M 5N 4M 4S, right-tailed clv on the first base after the skip.
    The 4 soft-clipped bases at the end are not drawn.

    AAG-----GCTT    <-bridge contig
    012     3456    <-contig coord
            ^ctg_clv
    AAGXXXXXGCTT
    012345678901    <-genome offset coord
            ^gnm_offset (default)
      ^gnm_offset (skip_check_size=1)
    """
    op_match = S.BAM_CMATCH
    cigar = [
        (op_match, 3),
        (S.BAM_CREF_SKIP, 5),
        (op_match, 4),
        (S.BAM_CSOFT_CLIP, 4),
    ]
    expected_default = 8

    default_offset = calc_genome_offset(cigar, ctg_clv=3, tail_side='right')
    assert default_offset == expected_default

    checked_offset = calc_genome_offset(
        cigar, ctg_clv=3, tail_side='right', skip_check_size=1)
    assert checked_offset == 2
def analyze_bridge(contig, read, ref_fa, dd_bridge, bridge_skip_check_size):
    """
    Gather the evidence a bridge read provides for a cleavage site (clv) on
    a contig.

    :param contig: aligned contig; its `reference_name`, `cigartuples` and
        `reference_start` attributes are read here
    :param read: read handed to `do_bridge` together with the contig
    :param ref_fa: passed through unchanged to `gen_hex_tuple`
    :param dd_bridge: holds bridge_evidence for a given contig, here it's
        just used to check if hexamer_search has already been done for a
        given ref_clv
    :param bridge_skip_check_size: forwarded to `calc_genome_offset` as its
        `skip_check_size` argument
    :return: `None` when `do_bridge` reports no support or when the genome
        offset is negative; otherwise the tuple
        `(seqname, strand, ref_clv, ctg_clv, tail_len, ctg_hex_tuple)`
    """
    seqname = contig.reference_name

    bdg_support = do_bridge(contig, read)
    if bdg_support is None:
        # likely a chimeric contig
        return None

    strand, ctg_clv, tail_len, tail_direction = bdg_support

    # Honour the caller-supplied skip check size; it used to be ignored in
    # favour of a hard-coded 3.
    offset = calc_genome_offset(
        contig.cigartuples, ctg_clv, tail_direction,
        skip_check_size=bridge_skip_check_size)
    if offset < 0:
        # meaning the clv is on soft/hard clipped region
        return None

    ref_clv = contig.reference_start + offset
    ctg_hex_tuple = gen_hex_tuple(
        contig, strand, ref_clv, ref_fa, ctg_clv, dd_bridge)

    return seqname, strand, ref_clv, ctg_clv, tail_len, ctg_hex_tuple
def test_for_nonskipped_contig(ctg_cigartuples, ctg_clv, gnm_offset):
    """
    Left-tailed clv on a contig without skips: the computed genome offset
    must equal `gnm_offset`.

    The three arguments are supplied from outside this function (presumably
    by a parametrize decorator -- confirm against the test module).

    Note: the "contig offset coord" used in these tests is different from
    the "contig coord", it doesn't consider clipped regions.
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, tail_side='left')
    assert observed == gnm_offset
def test_for_skipped_with_clv_before_a_skip(ctg_cigartuples, ctg_clv, gnm_offset):
    """
    Left-tailed clv located before a skip in the contig alignment: the
    computed genome offset must equal `gnm_offset`.

    The three arguments are supplied from outside this function (presumably
    by a parametrize decorator -- confirm against the test module).
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, tail_side='left')
    assert observed == gnm_offset
def test_for_skipped_contig_with_clv_after_a_skip(ctg_cigartuples, ctg_clv, gnm_offset):
    """
    Left-tailed clv located after a skip in the contig alignment: the
    computed genome offset must equal `gnm_offset`.

    The three arguments are supplied from outside this function (presumably
    by a parametrize decorator -- confirm against the test module).
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, tail_side='left')
    assert observed == gnm_offset
def test_for_contig_with_hardclip(ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv on a contig whose alignment starts with a hard clip
    (CIGAR: 2H 3M).

    `ctg_clv` and `expected_gnm_offset` are supplied from outside this
    function (presumably by a parametrize decorator -- confirm against the
    test module).
    """
    cigar = (
        (S.BAM_CHARD_CLIP, 2),
        (S.BAM_CMATCH, 3),
    )
    observed = calc_genome_offset(cigar, ctg_clv, tail_side='left')
    assert observed == expected_gnm_offset
def test_for_contig_with_softclip(ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv on a contig whose alignment starts with a soft clip
    (CIGAR: 3S 3M).

    `ctg_clv` and `expected_gnm_offset` are supplied from outside this
    function (presumably by a parametrize decorator -- confirm against the
    test module).
    """
    cigar = (
        (S.BAM_CSOFT_CLIP, 3),
        (S.BAM_CMATCH, 3),
    )
    observed = calc_genome_offset(cigar, ctg_clv, tail_side='left')
    assert observed == expected_gnm_offset
def test_for_a_long_contig_with_deletion(ctg_cigartuples, ctg_clv, gnm_offset):
    """
    Left-tailed clv on a long contig whose alignment contains a deletion:
    the computed genome offset must equal `gnm_offset`.

    The three arguments are supplied from outside this function (presumably
    by a parametrize decorator covering several clv positions -- confirm
    against the test module).
    """
    observed = calc_genome_offset(ctg_cigartuples, ctg_clv, tail_side='left')
    assert observed == gnm_offset
def test_for_contig_with_two_skips_and_soft_clip_edge_case_2():
    """
    Right-tailed clv on the first base after the second skip
    (CIGAR: 2M 2N 3M 2N 3M 4S); checks how the clv is passed across introns.

    The 4 soft-clipped bases at the end are not drawn.

    CG--ATC--GAT    <-bridge contig
    01  234  567    <-contig offset coord
             ^ctg_clv
    012345678901    <-genome offset coord
             ^gnm_offset (default)
          ^gnm_offset (skip_check_size=1 or 2)
    """
    op_match, op_skip = S.BAM_CMATCH, S.BAM_CREF_SKIP
    cigar = [
        (op_match, 2),
        (op_skip, 2),
        (op_match, 3),
        (op_skip, 2),
        (op_match, 3),
        (S.BAM_CSOFT_CLIP, 4),
    ]
    expected_default = 9

    default_offset = calc_genome_offset(cigar, ctg_clv=5, tail_side='right')
    assert default_offset == expected_default

    offset_check_1 = calc_genome_offset(
        cigar, ctg_clv=5, tail_side='right', skip_check_size=1)
    assert offset_check_1 == 6

    offset_check_2 = calc_genome_offset(
        cigar, ctg_clv=5, tail_side='right', skip_check_size=2)
    assert offset_check_2 == 6
def test_for_contig_with_two_skips_and_soft_clip_with_clv_before_the_skip():
    """
    Right-tailed clv in the middle exon of a contig with two skips
    (CIGAR: 2M 2N 3M 2N 3M 4S).

    The 4 soft-clipped bases at the end are not drawn.

    CG--ATC--GAT    <-bridge contig
    01  234  567    <-contig offset coord
         ^ctg_clv
    012345678901    <-genome offset coord
         ^gnm_offset (default and skip_check_size=1)
     ^gnm_offset (skip_check_size=2)
    """
    op_match, op_skip = S.BAM_CMATCH, S.BAM_CREF_SKIP
    cigar = [
        (op_match, 2),
        (op_skip, 2),
        (op_match, 3),
        (op_skip, 2),
        (op_match, 3),
        (S.BAM_CSOFT_CLIP, 4),
    ]
    expected_default = 5

    default_offset = calc_genome_offset(cigar, ctg_clv=3, tail_side='right')
    assert default_offset == expected_default

    offset_check_1 = calc_genome_offset(
        cigar, ctg_clv=3, tail_side='right', skip_check_size=1)
    assert offset_check_1 == 5

    offset_check_2 = calc_genome_offset(
        cigar, ctg_clv=3, tail_side='right', skip_check_size=2)
    assert offset_check_2 == 1
def test_for_a_case_derived_from_a_real_one_E1_L_4362_chr16_plus_strand_v2():
    """
    Basically the same as the first E1_L_4362 case, but a few more bases
    are included on both sides.

    CIGAR: 7M 5N 7M 4S, right-tailed clv on the first base after the skip.
    The 4 soft-clipped bases at the end are not drawn.

    AATAAAG-----GCTTGGA    <-bridge contig
    0123456     7890123    <-contig coord
                ^ctg_clv
    AATAAAGXXXXXGCTTGGA
    0123456789012345678    <-genome offset coord
                ^gnm_offset (default)
          ^gnm_offset (skip_check_size=1)
    """
    op_match = S.BAM_CMATCH
    cigar = [
        (op_match, 7),
        (S.BAM_CREF_SKIP, 5),
        (op_match, 7),
        (S.BAM_CSOFT_CLIP, 4),
    ]
    expected_default = 12

    default_offset = calc_genome_offset(cigar, ctg_clv=7, tail_side='right')
    assert default_offset == expected_default

    checked_offset = calc_genome_offset(
        cigar, ctg_clv=7, tail_side='right', skip_check_size=1)
    assert checked_offset == 6
def test_for_contig_with_clv_in_hardclip_spanning_clv():
    """
    Left-tailed clv that falls inside the leading hard clip
    (CIGAR: 3H 4M): the expected genome offset is negative.

    CGACGTA    <-contig, the first 3 bases are hard-clipped
    0123456    <-contig coord
      ^ctg_clv
       0123    <-genome offset coord
      ^gnm_offset (-1, one position left of genome offset 0)
    """
    cigar = (
        (S.BAM_CHARD_CLIP, 3),
        (S.BAM_CMATCH, 4),
    )

    offset_at_2 = calc_genome_offset(cigar, ctg_clv=2, tail_side='left')
    assert offset_at_2 == -1

    # moving ctg_clv one more base to the left moves the offset with it
    offset_at_1 = calc_genome_offset(cigar, ctg_clv=1, tail_side='left')
    assert offset_at_1 == -2
def init_ref_beg(ref_clv, cigartuples, ctg_clv):
    """
    Work out the beginning index in genome coordinates by walking back from
    `ref_clv` by the offset that `calc_genome_offset` reports from the left.

    :param ref_clv: clv position in genome coordinates
    :param cigartuples: CIGAR of the contig as (operation, length) pairs
    :param ctg_clv: clv position in contig coordinates
    :return: `ref_clv` minus the offset, minus the length of a leading
        soft/hard clip if there is one
    """
    # TODO: left may not matter in such case
    distance = calc_genome_offset(cigartuples, ctg_clv, 'left')

    # Since ref_clv = contig.reference_start + offset, ref_clv - offset gives
    # contig.reference_start. A leading clipped region isn't part of
    # reference_start, so its length has to be subtracted as well.
    leading = cigartuples[0]
    if leading[0] in (S.BAM_CSOFT_CLIP, S.BAM_CHARD_CLIP):
        distance += leading[1]

    return ref_clv - distance
def test_for_skipped_contig_with_clv_right_before_the_skip():
    """
    Right-tailed clv on the last base before a skip (CIGAR: 4M 2N 4M).

        AA
      AT┘       <-bridge read
    CGAT--CGAT  <-bridge contig
    0123  4567  <-contig offset coord
       ^ctg_clv
    0123456789  <-genome offset coord
       ^gnm_offset
    """
    op_match = S.BAM_CMATCH
    cigar = [
        (op_match, 4),
        (S.BAM_CREF_SKIP, 2),
        (op_match, 4),
    ]
    expected = 3

    observed = calc_genome_offset(cigar, ctg_clv=3, tail_side='right')
    assert observed == expected
def test_for_bridge_support_on_suffix_contig_edgecase():
    """
    Edge case for a suffix contig (CIGAR: 7M 3S) with a right-tailed clv at
    contig position 9, i.e. on the last base of the trailing soft clip.

    The expected genome offset is 9 as well: it keeps counting past the 7
    matched bases into the clipped region.
    """
    cigar = (
        (S.BAM_CMATCH, 7),
        (S.BAM_CSOFT_CLIP, 3),
    )
    expected = 9

    observed = calc_genome_offset(cigar, ctg_clv=9, tail_side='right')
    assert observed == expected
def test_skip_check():
    """
    Right-tailed clv on the first base after a skip (CIGAR: 2M 2N 3M) with
    `skip_check_size=1`: the expected genome offset is the last position
    before the skip.

         AA
     G--A┘    <-bridge read
    CG--ATC   <-bridge contig
    01  234   <-contig offset coord
        ^ctg_clv
    0123456   <-genome offset coord
     ^gnm_offset
    """
    op_match = S.BAM_CMATCH
    cigar = [
        (op_match, 2),
        (S.BAM_CREF_SKIP, 2),
        (op_match, 3),
    ]
    expected = 1

    observed = calc_genome_offset(
        cigar, ctg_clv=2, tail_side='right', skip_check_size=1)
    assert observed == expected
def test_for_contig_with_clv_after_insertion_so_tail_side_does_not_matter(
        ctg_clv, tail_side, expected_gnm_offset):
    """
    clv located after a 3-base insertion (CIGAR: 4M 3I 2M), exercised for
    whichever `tail_side` is passed in.

    The inserted bases occupy contig offset coords 4-6 and have no genome
    position of their own, so the contig bases after the insertion (contig
    offset coords 7-8) line up with genome offset coords 4-5.

    All three arguments are supplied from outside this function (presumably
    by a parametrize decorator -- confirm against the test module).
    """
    op_match = S.BAM_CMATCH
    cigar = ((op_match, 4), (S.BAM_CINS, 3), (op_match, 2))

    observed = calc_genome_offset(cigar, ctg_clv, tail_side)
    assert observed == expected_gnm_offset
def test_for_contig_with_clv_before_insertion_so_tail_side_does_not_matter(
        ctg_clv, tail_side, expected_gnm_offset):
    """
    clv located before a 3-base insertion (CIGAR: 4M 3I 2M), exercised for
    whichever `tail_side` is passed in.

    The inserted bases occupy contig offset coords 4-6; the contig bases
    before the insertion (contig offset coords 0-3) line up with genome
    offset coords 0-3.

    All three arguments are supplied from outside this function (presumably
    by a parametrize decorator -- confirm against the test module).
    """
    op_match = S.BAM_CMATCH
    cigar = ((op_match, 4), (S.BAM_CINS, 3), (op_match, 2))

    observed = calc_genome_offset(cigar, ctg_clv, tail_side)
    assert observed == expected_gnm_offset
def test_for_contig_with_3_base_insertion_with_clv_after_the_insertion(
        ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv located after a 3-base insertion. The CIGAR built here
    is 3M 3I 2M.

    `ctg_clv` and `expected_gnm_offset` are supplied from outside this
    function (presumably by a parametrize decorator -- confirm against the
    test module).
    """
    op_match = S.BAM_CMATCH
    cigar = ((op_match, 3), (S.BAM_CINS, 3), (op_match, 2))

    observed = calc_genome_offset(cigar, ctg_clv, tail_side='left')
    assert observed == expected_gnm_offset
def test_for_contig_with_3_base_insertion_with_clv_inside_the_insertion_and_clv_is_in_the_middle_of_insertion(
        ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv located in the middle of a 3-base insertion
    (CIGAR: 4M 3I 2M; the inserted bases occupy contig offset coords 4-6).

    `ctg_clv` and `expected_gnm_offset` are supplied from outside this
    function (presumably by a parametrize decorator -- confirm against the
    test module).
    """
    op_match = S.BAM_CMATCH
    cigar = ((op_match, 4), (S.BAM_CINS, 3), (op_match, 2))

    observed = calc_genome_offset(cigar, ctg_clv, tail_side='left')
    assert observed == expected_gnm_offset
def test_for_contig_with_1_base_insertion_with_clv_after_insertion(
        ctg_clv, expected_gnm_offset):
    """
    Left-tailed clv located after a 1-base insertion (CIGAR: 4M 1I 2M).

    The inserted base sits at contig offset coord 4, so the contig bases
    after it (contig offset coords 5-6) line up with genome offset
    coords 4-5.

    See parameters in the decorator for the various ctg_clv values.
    """
    op_match = S.BAM_CMATCH
    cigar = ((op_match, 4), (S.BAM_CINS, 1), (op_match, 2))

    observed = calc_genome_offset(cigar, ctg_clv, tail_side='left')
    assert observed == expected_gnm_offset
def test_for_contig_with_one_base_insertion_with_clv_before_insertion(
        ctg_clv, tail_side, expected_gnm_offset):
    """
    clv located before a 1-base insertion (CIGAR: 4M 1I 2M), exercised for
    whichever `tail_side` is passed in.

    The inserted base sits at contig offset coord 4; the contig bases
    before it (contig offset coords 0-3) line up with genome offset
    coords 0-3.

    See parameters in the decorator for the various ctg_clv values.
    """
    op_match = S.BAM_CMATCH
    cigar = ((op_match, 4), (S.BAM_CINS, 1), (op_match, 2))

    observed = calc_genome_offset(cigar, ctg_clv, tail_side)
    assert observed == expected_gnm_offset
def test_clv_inside_insertion(ctg_clv, tail_side, skip_check_size, expected_gnm_offset):
    """
    clv located inside a 3-base insertion on a contig that also carries a
    1-base skip upstream (CIGAR: 2M 1N 1M 3I 2M).

    The inserted bases occupy contig offset coords 3-5. `skip_check_size`
    is passed to `calc_genome_offset` as its fourth positional argument.

    All four arguments are supplied from outside this function (presumably
    by a parametrize decorator -- confirm against the test module).
    """
    op_match = S.BAM_CMATCH
    cigar = (
        (op_match, 2),
        (S.BAM_CREF_SKIP, 1),
        (op_match, 1),
        (S.BAM_CINS, 3),
        (op_match, 2),
    )

    observed = calc_genome_offset(cigar, ctg_clv, tail_side, skip_check_size)
    assert observed == expected_gnm_offset
def test_for_contig_with_two_skips_with_clv_right_before_the_skip():
    """
    Right-tailed clv on the last base before the second skip
    (CIGAR: 2M 2N 3M 2N 3M).

           AA
         TC┘       <-bridge read
    CG--ATC--GAT   <-bridge contig
    01  234  567   <-contig offset coord
          ^ctg_clv
    012345678901   <-genome offset coord
          ^gnm_offset
    """
    op_match, op_skip = S.BAM_CMATCH, S.BAM_CREF_SKIP
    cigar = [
        (op_match, 2),
        (op_skip, 2),
        (op_match, 3),
        (op_skip, 2),
        (op_match, 3),
    ]
    expected = 6

    observed = calc_genome_offset(cigar, ctg_clv=4, tail_side='right')
    assert observed == expected
def test_for_contig_with_two_skips_and_soft_clip_edge_case_1():
    """
    Right-tailed clv on the last base before the second skip of a contig
    that also ends with a soft clip (CIGAR: 2M 2N 3M 2N 3M 4S).

    The 4 soft-clipped bases at the end are not drawn.

           AA
         TC┘       <-bridge read
    CG--ATC--GAT   <-bridge contig
    01  234  567   <-contig offset coord
          ^ctg_clv
    012345678901   <-genome offset coord
          ^gnm_offset
    """
    op_match, op_skip = S.BAM_CMATCH, S.BAM_CREF_SKIP
    cigar = [
        (op_match, 2),
        (op_skip, 2),
        (op_match, 3),
        (op_skip, 2),
        (op_match, 3),
        (S.BAM_CSOFT_CLIP, 4),
    ]
    expected = 6

    observed = calc_genome_offset(cigar, ctg_clv=4, tail_side='right')
    assert observed == expected
def init_ref_end(ref_clv, cigartuples, ctg_clv, ctg_seq):
    """
    Work out the end index in genome coordinates by moving forward from
    `ref_clv` by the offset that `calc_genome_offset` reports when the
    contig is read from its right end.

    Compared with the begin-side counterpart (presumably `init_ref_beg`),
    one additional argument is needed, i.e. `ctg_seq`.

    :param ref_clv: clv position in genome coordinates
    :param cigartuples: CIGAR of the contig as (operation, length) pairs
    :param ctg_clv: clv position in contig coordinates, counted from the left
    :param ctg_seq: should include soft/hardclipped region if it's clipped
    :return: `ref_clv` plus the offset, plus the length of a trailing
        soft/hard clip if there is one
    """
    # Look at the alignment from the right: flip the CIGAR and re-express
    # the clv as a distance from the right end of the contig.
    flipped = list(reversed(cigartuples))
    clv_from_right = len(ctg_seq) - ctg_clv

    # TODO: left may not matter in such case
    distance = calc_genome_offset(flipped, clv_from_right, 'left')

    # note ref_clv = contig.reference_start + offset
    trailing = flipped[0]
    if trailing[0] in (S.BAM_CSOFT_CLIP, S.BAM_CHARD_CLIP):
        distance += trailing[1]

    return ref_clv + distance