def test_pos_neg_strand(self):
    """
    Reads on the positive and on the negative strand.

    The negative-strand read (flag 16) has an even length (100), the
    positive-strand read (flag 0) has an odd length (101).
    """
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    reverse_read = ('_:rbc:AAA', 16, 0, 50, 255, [(0, 100)], {'NH': 1})
    forward_read = ('_:rbc:CCC', 0, 0, 50, 255, [(0, 101)], {'NH': 1})
    bam_path = make_bam_file({
        'chromosomes': [('chr1', 3000)],
        'segments': [reverse_read, forward_read],
    })

    # NOTE(review): other tests in this file wrap this call in ``list(...)``
    # and compare against a list of tuples; here the return value is compared
    # directly against a dict - confirm which return shape is current.
    result = xlsites._processs_bam_file(bam_path, self.metrics, 10, self.tmp)

    by_strand = {
        ('chr1', '-'): {
            150: {'AAA': [(99, 50, 100, 1, 0)]},
        },
        ('chr1', '+'): {
            49: {'CCC': [(100, 150, 101, 1, 0)]},
        },
    }
    self.assertEqual(result, by_strand)
def test_run_simple(self):
    """Run ``xlsites.run`` on ``self.data`` and verify the reported metrics."""
    bam_path = make_bam_file(self.data)
    unique_path = get_temp_file_name(extension='.bed.gz')
    multi_path = get_temp_file_name(extension='.bed.gz')
    strange_path = get_temp_file_name(extension='.bam.gz')

    metrics = xlsites.run(
        bam_path,
        unique_path,
        multi_path,
        strange_path,
        mapq_th=5,
        report_progress=True,
    )

    # (attribute on the returned metrics object, expected value), checked in
    # this order; the first mismatch fails the test.
    checks = (
        ('all_recs', 6),
        ('notmapped_recs', 1),  # Unmapped records
        ('mapped_recs', 5),  # Mapped records
        ('lowmapq_recs', 1),  # Records with too low quality
        ('used_recs', 4),  # Records used in analysis
        ('invalidrandomer_recs', 1),  # Records with invalid randomers
        ('norandomer_recs', 1),  # Records with no randomers
        ('bc_cn', {'': 2, 'ACG': 1, 'CCCC': 1}),  # Barcode counter
        ('strange_recs', 1),  # Strange counter
    )
    for attribute, value in checks:
        self.assertEqual(getattr(metrics, attribute), value)
def test_explicit_whole_in(self):
    """
    Reads lie within a single transcript and cross the exon-intron
    landmark, which makes them explicit.

    Three reads are given, producing two different cross-links; one of the
    cross-links carries two distinct randomers.
    """
    # (qname, pos) of each read; the remaining segment fields are shared.
    read_specs = (
        ('name2:rbc:CCCC', 140),
        ('name2:rbc:AAAA', 142),
        ('name2:rbc:CCCC', 142),
    )
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    reads = [
        (qname, 0, 0, start, 255, [(0, 50)], {'NH': 1})
        for qname, start in read_specs
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': reads,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['UTR5-intron', '-10', '1', '1'],
        ['UTR5-intron', '-8', '2', '2'],
    ]

    rnamaps.run(
        bam, self.gtf, self.out, self.strange, self.cross_tr, mismatches=1)
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_implicit_intergenic(self):
    """The whole read lies in an intergenic region."""
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    read = ('name2:rbc:CCCC', 0, 0, 530, 255, [(0, 30)], {'NH': 1})
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': [read],
    })

    header = ['RNAmap', 'type', 'position', 'all', 'explicit']
    expected = [
        header,
        ['CDS-intergenic', '30', '0.5', '0'],
        ['intergenic-CDS', '-70', '0.5', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_negative_strand(self):
    """
    Same kind of check as the other RNA-map tests, but on the negative
    strand: every interval of ``self.gtf_data`` gets its strand column
    replaced with '-' and the read is reverse-strand (flag 16).

    NOTE(review): the original description said the read is whole in a
    single transcript/segment that borders on intergenic (downstream) -
    confirm against the GTF fixture.
    """
    # Column 6 (0-based) of each interval row is overwritten with '-'.
    flipped_rows = []
    for row in intervals_to_list(self.gtf_data):
        flipped_rows.append(row[:6] + ['-'] + row[7:])
    gtf_neg = make_file_from_list(flipped_rows)

    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    read = ('name2:rbc:CCCC', 16, 0, 549, 255, [(0, 30)], {'NH': 1})
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': [read],
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-intergenic', '20', '0.5', '0'],
        ['intergenic-CDS', '-80', '0.5', '0'],
    ]

    rnamaps.run(
        bam,
        gtf_neg,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_implicit_whole_in(self):
    """
    Reads lie within a single transcript and within a single segment,
    described as the "middle" segment of the transcript.

    Three reads are given, producing two different cross-links; one of the
    cross-links carries two distinct randomers.
    """
    # (qname, pos) of each read; the remaining segment fields are shared.
    read_specs = (
        ('name2:rbc:CCCC', 160),
        ('name2:rbc:CCCC', 163),
        ('name2:rbc:GGGG', 163),
    )
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    reads = [
        (qname, 0, 0, start, 255, [(0, 30)], {'NH': 1})
        for qname, start in read_specs
    ]
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': reads,
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['UTR5-intron', '10', '1', '0'],
        ['UTR5-intron', '13', '2', '0'],
    ]

    rnamaps.run(
        bam, self.gtf, self.out, self.strange, self.cross_tr, mismatches=1)
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_implicit_exons(self):
    """
    A single read lying within one transcript and one segment, where the
    segment is described as an EXON_TYPE "middle" segment of the
    transcript.
    """
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    read = ('name2:rbc:CCCC', 0, 0, 205, 255, [(0, 20)], {'NH': 1})
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': [read],
    })

    header = ['RNAmap', 'type', 'position', 'all', 'explicit']
    expected = [
        header,
        ['CDS-UTR3', '-25', '0.25', '0'],
        ['CDS-intron', '-25', '0.25', '0'],
        ['UTR5-CDS', '5', '0.25', '0'],
        ['intron-CDS', '5', '0.25', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_explicit_intergenic_right(self):
    """One half of the read is in a transcript region, the other half in intergenic."""
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    read = ('name2:rbc:CCCC', 0, 0, 480, 255, [(0, 50)], {'NH': 1})
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': [read],
    })

    expected = [
        ['RNAmap', 'type', 'position', 'all', 'explicit'],
        ['CDS-intergenic', '-20', '1', '1'],
    ]

    rnamaps.run(
        bam, self.gtf, self.out, self.strange, self.cross_tr, mismatches=1)
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_cross_transcript_read(self):
    """
    A single read is processed and the content of the cross-transcript
    report (``self.cross_tr``) is verified instead of the RNA-map output.

    NOTE(review): the original description ("half in transcript region and
    half in intergenic") is identical to the one on
    ``test_explicit_intergenic_right`` - confirm against the GTF fixture
    that it applies here.
    """
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    read = ('name2:rbc:CCCC', 0, 0, 235, 255, [(0, 50)], {'NH': 1})
    bam = make_bam_file({
        'chromosomes': [('1', 1000)],
        'segments': [read],
    })

    header = [
        'chrom',
        'strand',
        'xlink',
        'second-start',
        'end-position',
        'read_len',
    ]
    expected = [
        header,
        ['1', '+', '234', '0', '284', '50'],
    ]

    rnamaps.run(
        bam, self.gtf, self.out, self.strange, self.cross_tr, mismatches=1)
    produced = make_list_from_file(self.cross_tr)
    self.assertEqual(expected, produced)
def test_implicit_inter_tr(self):
    """
    The whole read is in a single transcript and a single segment, but that
    segment borders on intergenic (downstream).
    """
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    read = ('name2:rbc:CCCC', 0, 0, 610, 255, [(0, 30)], {'NH': 1})
    bam_content = {
        'chromosomes': [('1', 1000)],
        'segments': [read],
    }
    bam = make_bam_file(bam_content, rnd_seed=0)

    header = ['RNAmap', 'type', 'position', 'all', 'explicit']
    expected = [
        header,
        ['CDS-CDS', '-40', '0.3333', '0'],
        ['CDS-intron', '-40', '0.3333', '0'],
        ['intergenic-CDS', '10', '0.3333', '0'],
    ]

    rnamaps.run(
        bam,
        self.gtf,
        self.out,
        self.strange,
        self.cross_tr,
        mismatches=1,
        implicit_handling='split',
    )
    produced = make_list_from_file(self.out)
    self.assertEqual(expected, produced)
def test_diff_barcodes(self):
    """Several reads with different barcodes share the same position."""
    # The four reads differ only in their query name (barcode).
    qnames = ('_:rbc:AAA', '_:rbc:CCC', '_:rbc:CCC', '_:rbc:GGG')
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    reads = [
        (qname, 0, 0, 50, 255, [(0, 101)], {'NH': 1})
        for qname in qnames
    ]
    bam_content = {
        'chromosomes': [('chr1', 3000)],
        'segments': reads,
    }
    bam_path = make_bam_file(bam_content, rnd_seed=0)

    chunks = xlsites._processs_bam_file(bam_path, self.metrics, 10, self.tmp)
    result = list(chunks)

    by_barcode = {
        'AAA': [(100, 150, 101, 1, 0)],
        'CCC': [(100, 150, 101, 1, 0), (100, 150, 101, 1, 0)],
        'GGG': [(100, 150, 101, 1, 0)],
    }
    wanted = [
        (('chr1', '+'), 0.0167, {49: by_barcode}),
    ]
    self.assertEqual(result, wanted)
def setUp(self):
    """Create temporary outputs and the BED/GTF/BAM fixtures used by the tests."""
    warnings.simplefilter("ignore", ResourceWarning)

    # Temporary file and directory names to use for output.
    self.tmp1 = get_temp_file_name()
    self.tmp2 = get_temp_file_name()
    self.dir = get_temp_dir()
    self.dir2 = get_temp_dir()

    def annotation_row(feature, start, end, strand, attributes):
        # One nine-column annotation row on chromosome '1'.
        return ['1', '.', feature, start, end, '.', strand, '.', attributes]

    def gene_row(feature, attributes):
        # All rows of the GTF fixture share coordinates 10-20 on '+'.
        return annotation_row(feature, '10', '20', '+', attributes)

    cross_link_rows = [
        ['1', '16', '17', '.', '5', '+'],
        ['1', '14', '15', '.', '5', '+'],
        ['1', '15', '16', '.', '5', '+'],
    ]
    self.cross_links = make_file_from_list(cross_link_rows, extension='bed')

    peak_rows = [
        ['1', '15', '16', '.', '15', '+'],
    ]
    self.peaks = make_file_from_list(peak_rows)

    annotation_rows = [
        annotation_row('CDS', '10', '20', '+', 'biotype "A";'),
        annotation_row('ncRNA', '10', '20', '+', 'biotype "A";'),
        annotation_row('CDS', '10', '20', '+', 'biotype "A";'),
        annotation_row('CDS', '10', '20', '+', 'biotype "B";'),
        annotation_row('CDS', '10', '20', '-', 'biotype "C";'),
        annotation_row('CDS', '12', '18', '+', 'biotype "A";'),
        annotation_row('CDS', '30', '40', '+', 'biotype "D";'),
    ]
    self.annotation = make_file_from_list(annotation_rows)

    gtf_rows = [
        gene_row('gene', 'gene_id "A";'),
        gene_row('transcript', 'gene_id "A"; transcript_id "AA";'),
        gene_row(
            'exon',
            'gene_id "A"; transcript_id "AA"; exon_number "1";',
        ),
    ]
    self.gtf = make_file_from_list(gtf_rows)

    bam_content = {
        'chromosomes': [
            ('1', 3000),
            ('2', 2000),
        ],
        'segments': [
            ('name3:rbc:CCCC:', 0, 0, 100, 20, [(0, 100)], {'NH': 1}),
            ('name4:ABC', 0, 0, 300, 20, [(0, 200)], {'NH': 11}),
        ],
    }
    self.bam = make_bam_file(bam_content, rnd_seed=0)
def test_no_nh_tag(self):
    """A record without the "NH" tag makes BAM processing raise ValueError."""
    # The tags dict of the only segment is empty, so no NH tag is set.
    untagged_read = ('name5', 0, 0, 0, 50, [(0, 100)], {})
    bam_path = make_bam_file(
        {
            'chromosomes': [('chr1', 3000)],
            'segments': [untagged_read],
        },
        rnd_seed=0,
    )

    pattern = r'"NH" tag not set for record: .*'
    with self.assertRaisesRegex(ValueError, pattern):
        list(xlsites._processs_bam_file(bam_path, self.metrics, 10, self.tmp))
def test_low_quality(self):
    """
    A mapped read (FLAG=0) with low mapping quality (MAPQ=3) is counted in
    ``lowmapq_recs`` and is not counted in ``used_recs``.

    The third argument passed to ``_processs_bam_file`` (10) is presumably
    the MAPQ threshold - verify against its signature.
    """
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    bam_fname = make_bam_file({
        'chromosomes': [('chr1', 3000)],
        'segments': [('name1', 0, 0, 0, 3, [(0, 0)], {})],
    })

    self.metrics.lowmapq_recs = 0
    self.metrics.used_recs = 0
    # Consume the result, as the other tests in this file do: if
    # ``_processs_bam_file`` is a generator, nothing is processed (and no
    # metric is updated) until it is iterated. Iterating a plain container
    # return value is harmless.
    list(xlsites._processs_bam_file(bam_fname, self.metrics, 10, self.tmp))

    self.assertEqual(self.metrics.lowmapq_recs, 1)
    self.assertEqual(self.metrics.used_recs, 0)
def test_unmapped(self):
    """An unmapped read (FLAG=4) is counted as not mapped and is not used."""
    # Segment fields: (qname, flag, refname, pos, mapq, cigar, tags)
    unmapped_read = ('name1', 4, 0, 0, 0, [(0, 0)], {})
    bam_content = {
        'chromosomes': [('chr1', 3000)],
        'segments': [unmapped_read],
    }
    bam_path = make_bam_file(bam_content, rnd_seed=0)

    # Reset the counters this test inspects.
    self.metrics.all_recs = 0
    self.metrics.notmapped_recs = 0
    self.metrics.used_recs = 0

    chunks = xlsites._processs_bam_file(bam_path, self.metrics, 0, self.tmp)
    list(chunks)

    self.assertEqual(self.metrics.notmapped_recs, 1)
    self.assertEqual(self.metrics.all_recs, 1)
    self.assertEqual(self.metrics.used_recs, 0)