class TestHeader:
    """Tests for class Header"""
    @pytest.mark.parametrize(
        "header, tuple_",
        [(loci.Header(
            reference='chr1', strand='-', category='family', source='bam'),
          ('chr1', '-', 'family', 'bam')),
         (loci.Header(reference='chr1', strand='-'), ('chr1', '-'))])
    def test_tuple(self, header, tuple_):
        """Header.tuple contains only the fields that were set (the
        second case omits category and source)."""
        assert header.tuple == tuple_

    @pytest.mark.parametrize(
        "header, dtype", [(loci.Header(
            reference='chr1', strand='-', category='family', source='bam'),
                           (np.dtype([('reference', 'O'), ('strand', '<U1'),
                                      ('category', 'O'), ('source', 'O')]))),
                          (loci.Header(reference='chr1', strand='-'),
                           (np.dtype([('reference', 'O'),
                                      ('strand', '<U1')])))])
    def test_dtype(self, header, dtype):
        """Header.dtype describes only the fields that were set."""
        assert header.dtype == dtype

    def test_mutate(self):
        """mutate() replaces the given fields and returns a new Header."""
        origional = loci.Header(reference='chr1',
                                strand='-',
                                category=None,
                                source='bam')

        answer = loci.Header(reference='chr1',
                             strand='.',
                             category='family',
                             source=None)

        assert origional.mutate(strand='.', category='family',
                                source=None) == answer
    def test_map(self):
        """ContigSet.map applies a function to every contig in the set.

        NOTE(review): this exercises ContigSet, not Header -- it looks
        misplaced inside TestHeader; confirm intended location.
        """
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))
        contig_alt_1 = loci.Contig(
            header_1,
            np.array([(101, 'gypsy1'), (107, 'gypsy4')], dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))
        contig_alt_2 = loci.Contig(
            header_2,
            np.array([(103, 'gypsy7'), (109, 'gypsy1')], dtype=dtype_loci))

        query = loci.ContigSet(contig_1, contig_2)

        def func(contig):
            """dummy function that adds 100 to contig loci 'tip's"""
            array = np.copy(contig.loci)
            array['tip'] += 100
            return loci.Contig(contig.header, array)

        query = query.map(func)

        answer = loci.ContigSet(contig_alt_1, contig_alt_2)

        assert query == answer
def test_append_header_miss_match():
    """loci.append must reject contigs whose headers differ.

    The contigs below are identical apart from ``reference``
    ('chr1' vs 'chr2'), so appending them should raise ValueError.
    """
    loci_dtype = np.dtype([('tip', np.int64), ('element', 'O')])

    query_1 = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                 dtype=loci_dtype))

    query_2 = loci.Contig(
        loci.Header(
            reference='chr2',  # miss-matched
            strand='-',
            category='family',
            source='bam'),
        np.array([(9, 'element4'), (6, 'element5'), (2, 'element6')],
                 dtype=loci_dtype))

    # pytest.raises replaces the try/except/else + ``assert False``
    # pattern and reports a clearer failure message.
    with pytest.raises(ValueError):
        loci.append(query_1, query_2)
def test_create_contig_ids():
    """create_contig_ids adds a unique 'ID' field to each locus.

    From the fixture data the ID appears to be
    ``<category>_<reference>_<strand>_<position>``, where the position
    is the stop on the '+' strand and the start on the '-' strand --
    presumably the insertion-facing end; confirm against
    fingerprint.create_contig_ids.
    """
    dtype_loci_query = np.dtype([('start', np.int64), ('stop', np.int64),
                                 ('element', 'O')])

    query = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 5, 'gypsy1'), (7, 9, 'gypsy4')],
                     dtype=dtype_loci_query)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='gypsy'),
            np.array([(3, 8, 'gypsy7'), (9, 12, 'gypsy1')],
                     dtype=dtype_loci_query)))

    # The answer dtype is the query dtype plus the appended 'ID' field.
    dtype_loci_answer = np.dtype([('start', np.int64), ('stop', np.int64),
                                  ('element', 'O'), ('ID', 'O')])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 5, 'gypsy1', 'gypsy_chr1_+_5'),
                      (7, 9, 'gypsy4', 'gypsy_chr1_+_9')],
                     dtype=dtype_loci_answer)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='gypsy'),
            np.array([(3, 8, 'gypsy7', 'gypsy_chr1_-_3'),
                      (9, 12, 'gypsy1', 'gypsy_chr1_-_9')],
                     dtype=dtype_loci_answer)))

    assert query.map(fingerprint.create_contig_ids) == answer
def test_append():
    """Appending two contigs with identical headers concatenates their
    loci in order."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])

    def build(values):
        # Each contig gets its own (equal-valued) header, matching how
        # callers typically construct them.
        return loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='family',
                        source='bam'),
            np.array(values, dtype=dtype))

    first = build([(5, 'element1'), (1, 'element2'), (7, 'element3')])
    second = build([(9, 'element4'), (6, 'element5'), (2, 'element6')])

    expected = build([(5, 'element1'), (1, 'element2'), (7, 'element3'),
                      (9, 'element4'), (6, 'element5'), (2, 'element6')])

    assert loci.append(first, second) == expected
def test_extract_gff_intervals():
    """Extract 'Gypsy' and 'Copia' annotation intervals from a GFF file.

    Resulting contigs are keyed by reference, category and the source
    file's basename.
    """
    gff = DATA_PATH + 'testAnnotation-2017-11-27.gff'

    query = fingerprintio.extract_gff_intervals(gff, 'chr1',
                                                ['Gypsy', 'Copia'])

    # Use plain 'O' for the object field: a byte-order prefix ('<O') is
    # meaningless for object dtypes and is deprecated/rejected by newer
    # numpy releases; 'O' also matches every other dtype in this module.
    dtype_loci = np.dtype([('start', np.int64), ('stop', np.int64),
                           ('element', 'O')])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1',
                        category='Gypsy',
                        source='testAnnotation-2017-11-27.gff'),
            np.array([(3150, 3200, 'Gypsy-21_ClassI;chr1:3150-3200'),
                      (24250, 24700, 'Gypsy-21_ClassI;chr1:24250-24700')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        category='Copia',
                        source='testAnnotation-2017-11-27.gff'),
            np.array([(98260, 98322, 'Copia-10_ClassI;chr1:98260-98322')],
                     dtype=dtype_loci)))

    assert query == answer
def test_count_reads_n2():
    """Count reads from two samples within each query interval.

    ``n_common_elements=2`` adds, per sample, the two most frequent
    mate-element names with their counts ('.'/0 padding when fewer than
    two exist), and ``trim=True`` shrinks each interval to the span
    covered by reads -- inferred from the fixture data; confirm against
    fingerprint.count_reads.
    """
    dtype_loci_reads = np.dtype([('tip', np.int64), ('element', 'O')])

    reads = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='gypsy',
                        source='bam1'),
            np.array([(2, 'gypsy1'), (4, 'gypsy1'), (5, 'gypsy4'),
                      (7, 'gypsy4'), (7, 'gypsy7'), (7, 'gypsy1'),
                      (8, 'gypsy1'), (8, 'gypsy1')],
                     dtype=dtype_loci_reads)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='gypsy',
                        source='bam2'),
            np.array([(3, 'gypsy1'), (4, 'gypsy1'), (6, 'gypsy1'),
                      (7, 'gypsy1'), (7, 'gypsy1'), (7, 'gypsy1'),
                      (7, 'gypsy1'), (50, 'gypsy7')],
                     dtype=dtype_loci_reads)))

    dtype_loci_query = np.dtype([('start', np.int64), ('stop', np.int64)])

    query = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 15), (30, 60)], dtype=dtype_loci_query)))

    # Nested result dtype: per interval a median plus, for each of the
    # two samples, its name, total count, and the top-2 element counts.
    dtype_loci_answer = np.dtype([
        ('start', np.int64), ('stop', np.int64), ('median', np.int64),
        ('sample', [('0', [('name', 'O'), ('count', np.int64),
                           ('element', [('0', [('name', 'O'),
                                               ('count', np.int64)]),
                                        ('1', [('name', 'O'),
                                               ('count', np.int64)])])]),
                    ('1', [('name', 'O'), ('count', np.int64),
                           ('element', [('0', [('name', 'O'),
                                               ('count', np.int64)]),
                                        ('1', [('name', 'O'),
                                               ('count', np.int64)])])])])
    ])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(2, 8, 7, (('bam1', 8, (('gypsy1', 5), ('gypsy4', 2))),
                                 ('bam2', 7, (('gypsy1', 7), ('.', 0))))),
                      (50, 50, 50, (('bam1', 0, (('.', 0), ('.', 0))),
                                    ('bam2', 1, (('gypsy7', 1), ('.', 0)))))],
                     dtype=dtype_loci_answer)))

    assert fingerprint.count_reads(query,
                                   reads,
                                   trim=True,
                                   n_common_elements=2) == answer
    # NOTE(review): the two nested ``def``s below take ``self`` and are
    # never called -- they look like test-class methods pasted here by a
    # faulty concatenation. They execute only as harmless local
    # definitions; consider relocating them.
    def test_mutate(self):
        origional = loci.Header(reference='chr1',
                                strand='-',
                                category=None,
                                source='bam')

        answer = loci.Header(reference='chr1',
                             strand='.',
                             category='family',
                             source=None)

        assert origional.mutate(strand='.', category='family',
                                source=None) == answer
    def test_add_append_headers(self):
        """Contigs with same header should be appended"""
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header = loci.Header(reference='chr1', strand='+', category='gypsy')

        contig_1 = loci.Contig(
            header, np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype_loci))

        contig_2 = loci.Contig(
            header, np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype_loci))

        query = loci.ContigSet(contig_1)
        query.add(contig_2, append_duplicate_headers=True)

        assert len(query) == 4
        assert len(list(query.contigs())) == 1
        assert len(query.headers()) == 1

        query_loci = list(query.contigs())[0].loci

        answer_loci = np.array([(1, 'gypsy1'), (7, 'gypsy4'), (3, 'gypsy7'),
                                (9, 'gypsy1')],
                               dtype=dtype_loci)

        npt.assert_array_equal(query_loci, answer_loci)
def test_drop_field():
    """Dropping the 'element' field leaves a single-field 'tip' contig."""
    query = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                 dtype=np.dtype([('tip', np.int64), ('element', 'O')])))

    answer = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'), np.array([5, 1, 7]))
    # NOTE(review): answer.loci starts as a plain int array and is then
    # re-assigned as a one-field structured array -- presumably because
    # the structured array cannot be built directly from scalar tuples;
    # confirm this round-trip matches drop_field's output dtype.
    answer.loci = np.array(answer.loci, np.dtype([('tip', np.int64)]))

    assert loci.drop_field(query, 'element') == answer
    def test_headers(self):
        """headers() yields exactly one header per contig in the set."""
        dtype = np.dtype([('tip', np.int64), ('element', 'O')])

        chr1_header = loci.Header(reference='chr1', strand='+',
                                  category='gypsy')
        chr1_contig = loci.Contig(
            chr1_header,
            np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))

        chr2_header = loci.Header(reference='chr2', strand='+',
                                  category='gypsy')
        chr2_contig = loci.Contig(
            chr2_header,
            np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

        contig_set = loci.ContigSet(chr1_contig, chr2_contig)

        assert set(contig_set.headers()) == {chr1_header, chr2_header}
    def test_dtype_loci(self):
        """dtype_loci() reports the shared loci dtype of the set."""
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))

        query = loci.ContigSet(contig_1, contig_2)

        assert query.dtype_loci() == contig_1.loci.dtype
        # Was a copy-paste duplicate of the contig_1 assertion; check the
        # second contig's dtype as clearly intended.
        assert query.dtype_loci() == contig_2.loci.dtype
    def test_init_different_headers(self):
        """A set built from contigs with distinct headers keeps them
        as separate contigs."""
        dtype = np.dtype([('tip', np.int64), ('element', 'O')])

        contigs = [
            loci.Contig(
                loci.Header(reference=ref, strand='+', category='gypsy'),
                np.array(values, dtype=dtype))
            for ref, values in (('chr1', [(1, 'gypsy1'), (7, 'gypsy4')]),
                                ('chr2', [(3, 'gypsy7'), (9, 'gypsy1')]))
        ]

        query = loci.ContigSet(*contigs)

        # Four loci in total, held under two distinct headers.
        assert len(query) == 4
        assert len(list(query.contigs())) == 2
        assert len(query.headers()) == 2
def test_cluster():
    """Cluster read tips into (start, stop) intervals.

    Runs loci.clusters with minimum_reads=10 over a realistic tip
    distribution; the three dense regions below should collapse to the
    three intervals in ``answer_loci``.
    """
    header = loci.Header(reference='chr1',
                         strand='-',
                         category='Gypsy',
                         source='bam')
    query_loci = np.array([(0, 'Gypsy'), (0, 'Gypsy'), (60, 'Gypsy'),
                           (61, 'Gypsy'), (61, 'Gypsy'), (61, 'Gypsy'),
                           (76, 'Gypsy'), (78, 'Gypsy'), (122, 'Gypsy'),
                           (122, 'Gypsy'), (141, 'Gypsy'), (183, 'Gypsy'),
                           (251, 'Gypsy'), (260, 'Gypsy'), (260, 'Gypsy'),
                           (263, 'Gypsy'), (263, 'Gypsy'), (267, 'Gypsy'),
                           (267, 'Gypsy'), (288, 'Gypsy'), (288, 'Gypsy'),
                           (295, 'Gypsy'), (300, 'Gypsy'), (310, 'Gypsy'),
                           (310, 'Gypsy'), (317, 'Gypsy'), (317, 'Gypsy'),
                           (334, 'Gypsy'), (334, 'Gypsy'), (335, 'Gypsy'),
                           (338, 'Gypsy'), (338, 'Gypsy'), (338, 'Gypsy'),
                           (338, 'Gypsy'), (340, 'Gypsy'), (342, 'Gypsy'),
                           (342, 'Gypsy'), (344, 'Gypsy'), (344, 'Gypsy'),
                           (358, 'Gypsy'), (367, 'Gypsy'), (370, 'Gypsy'),
                           (370, 'Gypsy'), (377, 'Gypsy'), (387, 'Gypsy'),
                           (402, 'Gypsy'), (403, 'Gypsy'), (410, 'Gypsy'),
                           (410, 'Gypsy'), (410, 'Gypsy'), (418, 'Gypsy'),
                           (418, 'Gypsy'), (424, 'Gypsy'), (424, 'Gypsy'),
                           (577, 'Gypsy'), (857, 'Gypsy'), (879, 'Gypsy'),
                           (921, 'Gypsy'), (921, 'Gypsy'), (1007, 'Gypsy'),
                           (1031, 'Gypsy'), (1051, 'Gypsy'), (1051, 'Gypsy'),
                           (1059, 'Gypsy'), (1071, 'Gypsy'), (1071, 'Gypsy'),
                           (1080, 'Gypsy'), (1094, 'Gypsy'), (1094, 'Gypsy'),
                           (1110, 'Gypsy'), (1110, 'Gypsy'), (1113, 'Gypsy'),
                           (1113, 'Gypsy'), (1183, 'Gypsy'), (1189, 'Gypsy'),
                           (1200, 'Gypsy'), (1200, 'Gypsy'), (1217, 'Gypsy'),
                           (1234, 'Gypsy'), (1234, 'Gypsy'), (1591, 'Gypsy'),
                           (1620, 'Gypsy'), (1620, 'Gypsy'), (1662, 'Gypsy'),
                           (1686, 'Gypsy'), (1707, 'Gypsy'), (1755, 'Gypsy'),
                           (1828, 'Gypsy'), (1828, 'Gypsy'), (1848, 'Gypsy'),
                           (1848, 'Gypsy'), (1848, 'Gypsy'), (1848, 'Gypsy'),
                           (1851, 'Gypsy'), (1851, 'Gypsy'), (1852, 'Gypsy'),
                           (1917, 'Gypsy')],
                          dtype=np.dtype([('tip', np.int64),
                                          ('element', 'O')]))
    query = loci.Contig(header, query_loci)

    answer_loci = np.array([(0, 577), (879, 1234), (1662, 1917)],
                           dtype=np.dtype([('start', np.int64),
                                           ('stop', np.int64)]))
    answer = loci.Contig(header, answer_loci)

    assert loci.clusters(query,
                         'tip',
                         10,
                         epsilon=200,
                         minimum_epsilon=10,
                         method='SDBICAN-aggressive') == answer
    def test_iter_values(self):
        """iter_values() prefixes each locus record with its header
        fields (reference, strand, category)."""
        dtype = np.dtype([('tip', np.int64), ('element', 'O')])

        contig_a = loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype))
        contig_b = loci.Contig(
            loci.Header(reference='chr2', strand='+', category='gypsy'),
            np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype))

        contig_set = loci.ContigSet(contig_a, contig_b)

        expected = {('chr1', '+', 'gypsy', 1, 'gypsy1'),
                    ('chr1', '+', 'gypsy', 7, 'gypsy4'),
                    ('chr2', '+', 'gypsy', 3, 'gypsy7'),
                    ('chr2', '+', 'gypsy', 9, 'gypsy1')}

        assert set(contig_set.iter_values()) == expected
def test_mutate_header():
    """mutate_header returns a contig with an altered header but
    identical loci, leaving the query unequal to the result."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    records = np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                       dtype=dtype)

    original = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category=None,
                    source='bam'),
        records)

    expected = loci.Contig(
        loci.Header(reference='chr1',
                    strand='.',
                    category='family',
                    source=None),
        records)

    assert loci.mutate_header(original,
                              strand='.',
                              category='family',
                              source=None) == expected
    # The mutated contig must no longer compare equal to the original.
    assert loci.mutate_header(
        original, strand='.', category='family', source=None) != original
def test_as_array():
    """Test conversion of nested loci data to flat array"""
    # Build the nested dtype bottom-up: per-element (name, count) pairs,
    # two of them per sample, two samples per locus.
    dtype_element_count = np.dtype([('name', 'O'), ('count', np.int64)])
    dtype_elements = np.dtype([(str(i), dtype_element_count)
                               for i in range(2)])
    dtype_sample_count = np.dtype([('name', 'O'), ('count', np.int64),
                                   ('element', dtype_elements)])
    dtype_samples = np.dtype([(str(i), dtype_sample_count) for i in range(2)])
    dtype_loci = np.dtype([('start', np.int64), ('stop', np.int64),
                           ('sample', dtype_samples)])

    # 3 element array with nested structured data
    data = np.array([(10, 15, (('bam1', 9, (('gypsy7', 5), ('gypsy3', 3))),
                               ('bam2', 8, (('gypsy7', 7), ('gypsy3', 1))))),
                     (21, 32, (('bam1', 7, (('gypsy3', 5), ('gypsy1', 2))),
                               ('bam2', 7, (('gypsy3', 7), (None, 0))))),
                     (43, 61, (('bam1', 5, (('gypsy9', 3), ('gypsy3', 2))),
                               ('bam2', 6, (('gypsy3', 5), ('gypsy9', 1)))))],
                    dtype=dtype_loci)

    header = loci.Header(reference='chr1', strand='-', category='gypsy')

    query = loci.Contig(header, data)

    # array dtype includes header fields
    # Nested field names are flattened by joining the path with '_',
    # e.g. sample -> '0' -> element -> '1' -> name becomes
    # 'sample_0_element_1_name'.
    answer_dtype = np.dtype([('reference', 'O'), ('strand', '<U1'),
                             ('category', 'O'), ('start', np.int64),
                             ('stop', np.int64), ('sample_0_name', 'O'),
                             ('sample_0_count', np.int64),
                             ('sample_0_element_0_name', 'O'),
                             ('sample_0_element_0_count', np.int64),
                             ('sample_0_element_1_name', 'O'),
                             ('sample_0_element_1_count', np.int64),
                             ('sample_1_name', 'O'),
                             ('sample_1_count', np.int64),
                             ('sample_1_element_0_name', 'O'),
                             ('sample_1_element_0_count', np.int64),
                             ('sample_1_element_1_name', 'O'),
                             ('sample_1_element_1_count', np.int64)])

    # 3 element array with flat structured data
    # Note the None element name in the second record survives the
    # flattening unchanged (object field).
    answer = np.array([('chr1', '-', 'gypsy', 10, 15, 'bam1', 9, 'gypsy7', 5,
                        'gypsy3', 3, 'bam2', 8, 'gypsy7', 7, 'gypsy3', 1),
                       ('chr1', '-', 'gypsy', 21, 32, 'bam1', 7, 'gypsy3', 5,
                        'gypsy1', 2, 'bam2', 7, 'gypsy3', 7, None, 0),
                       ('chr1', '-', 'gypsy', 43, 61, 'bam1', 5, 'gypsy9', 3,
                        'gypsy3', 2, 'bam2', 6, 'gypsy3', 5, 'gypsy9', 1)],
                      dtype=answer_dtype)

    npt.assert_array_equal(loci.as_array(query), answer)
def test_unions_buffered():
    """Merge overlapping loci and buffer the merged intervals by 5."""
    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])
    header = loci.Header(reference='chr1', strand='-', source='bam')

    def make(intervals):
        return loci.Contig(header, np.array(intervals, dtype=dtype))

    query = make([(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25),
                  (16, 17), (19, 20)])
    expected = make([(-2, 9), (10, 12), (13, 14), (15, 30)])

    assert loci.unions_buffered(query, 5) == expected
def test_iter_values():
    """Header values prefix each locus tuple; category is None here and
    does not appear in the expected tuples."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    records = np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                       dtype=dtype)

    contig = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category=None,
                    source='bam'),
        records)

    expected = [('chr1', '-', 'bam', 5, 'element1'),
                ('chr1', '-', 'bam', 1, 'element2'),
                ('chr1', '-', 'bam', 7, 'element3')]

    assert list(loci.iter_values(contig)) == expected
def test_sort():
    """Sorting on 'tip' orders loci ascending and returns a new contig
    rather than mutating the query."""
    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    header = loci.Header(reference='chr1', strand='-', category='family')

    unsorted_records = np.array(
        [(5, 'element1'), (1, 'element2'), (7, 'element3'),
         (9, 'element4'), (6, 'element5'), (2, 'element6')],
        dtype=dtype)
    sorted_records = np.array(
        [(1, 'element2'), (2, 'element6'), (5, 'element1'),
         (6, 'element5'), (7, 'element3'), (9, 'element4')],
        dtype=dtype)

    query = loci.Contig(header, unsorted_records)
    expected = loci.Contig(header, sorted_records)

    assert loci.sort(query, order='tip') == expected
    assert loci.sort(query, order='tip') != query
    def test_init_clashing_headers(self):
        """Contigs with same header should cause ValueError"""
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header = loci.Header(reference='chr1', strand='+', category='gypsy')

        contig_1 = loci.Contig(
            header, np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype_loci))

        contig_2 = loci.Contig(
            header, np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype_loci))

        # pytest.raises replaces try/except/else; the original's
        # ``assert True`` in the except-branch was a no-op.
        with pytest.raises(ValueError):
            loci.ContigSet(contig_1, contig_2)
def test_unions(query, answer):
    """
    Test includes following edge cases:
     * Long locus completely overlaps short loci:
        (15, 25) & (16, 17) & (19, 20) --> (15, 25)
     * Adjacent loci do not get merged:
        (7, 9) & (10, 12) -->  (*, 9) & (10, *)
     * Locus may span a single base:
        (13, 13) --> (13, 13)
    """
    # NOTE(review): this takes ``query`` and ``answer`` parameters but no
    # @pytest.mark.parametrize decorator is visible -- as written pytest
    # would fail with "fixture 'query' not found". The decorator was
    # presumably lost when this snippet was extracted; restore it from
    # the original source rather than guessing the parameter sets.
    header = loci.Header(reference='chr1', strand='-', source='bam')

    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])

    query = loci.Contig(header, np.array(query, dtype=dtype))

    answer = loci.Contig(header, np.array(answer, dtype=dtype))

    assert loci.unions(query) == answer
def test_cluster_empty():
    """Clustering an empty contig yields an empty interval contig with
    the same header."""
    header = loci.Header(reference='chr1',
                         strand='-',
                         category='Gypsy',
                         source='bam')

    empty_tips = np.array([],
                          dtype=np.dtype([('tip', np.int64),
                                          ('element', 'O')]))
    empty_intervals = np.array([],
                               dtype=np.dtype([('start', np.int64),
                                               ('stop', np.int64)]))

    result = loci.clusters(loci.Contig(header, empty_tips),
                           'tip',
                           10,
                           epsilon=200,
                           minimum_epsilon=10,
                           method='SDBICAN-aggressive')

    assert result == loci.Contig(header, empty_intervals)
def test_extract_informative_read_tips():
    """
    Test extraction of informative reads.
    Not all families of reads extracted.
    Family with no reads ('NOT-A-FAMILY') extracted.
    """
    bam = DATA_PATH + 'testA-2017-06-08.bam'

    query = fingerprintio.extract_informative_read_tips(
        bam,
        'chr1', ['Gypsy', 'PIF-Harbinger', 'NOT-A-FAMILY'],
        quality=0,
        tag='ME')

    dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

    # One contig per (strand, family) combination; source is the bam
    # file's basename.
    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='Gypsy',
                        source='testA-2017-06-08.bam'),
            np.array([(2452, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2506, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2553, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2566, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2577, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2577, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr8_2502854'),
                      (2973, 'Gypsy_Gypsy26_chr18_27801424'),
                      (3024, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3062, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3039, 'Gypsy_Gypsy26_chr2_1987286'),
                      (3138, 'Gypsy_Gypsy26_chr18_27801424'),
                      (24065, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24184, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24195, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24217, 'Gypsy_Gypsy12_chr1_12715223')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='Gypsy',
                        source='testA-2017-06-08.bam'),
            np.array([(3217, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3226, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3246, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3405, 'Gypsy_Gypsy26_chr2_1987286'),
                      (3646, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3776, 'Gypsy_Gypsy26_chr18_27801424'),
                      (3779, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3800, 'Gypsy_Gypsy26_chr8_5114633'),
                      (24787, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24799, 'Gypsy_Gypsy29_chr11_13193899'),
                      (24850, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24854, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24857, 'Gypsy_Gypsy23_chr15_8310356'),
                      (24860, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24872, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24877, 'Gypsy_GYVIT1_chr6_13115950'),
                      (24894, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24895, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24910, 'Gypsy_Gypsy23_chr14_11656393'),
                      (24919, 'Gypsy_Gypsy23_chrUn_38723460')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='PIF-Harbinger',
                        source='testA-2017-06-08.bam'),
            np.array([(21282, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21308, 'PIF-Harbinger_Harbinger-3_chr2_4407914'),
                      (21435, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21448, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='PIF-Harbinger',
                        source='testA-2017-06-08.bam'),
            np.array([(21834, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21945, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21968, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21982, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579')],
                     dtype=dtype_loci)),
        # Families with no matching reads still get (empty) contigs.
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='NOT-A-FAMILY',
                        source='testA-2017-06-08.bam'),
            np.array([], dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='NOT-A-FAMILY',
                        source='testA-2017-06-08.bam'),
            np.array([], dtype=dtype_loci)))

    assert query == answer
# Example #25  (page/extraction artifact -- commented out so the module parses)
# 0  (vote-count artifact from the source page)
def extract_informative_read_tips(bams,
                                  references,
                                  categories,
                                  quality=0,
                                  tag='ME'):
    """
    Extract the tips of 'informative' reads from one or more bam files.

    Informative reads are those that flank potential transposon
    insertions.
    The specific element (mate element) that each read is linked to
    should be stored using a sam tag which is 'ME' by default.
    Reads are categorised by transposon (super-)families by matching
    family names to the start of each read's mate-element name.

    :param bams: Path(s) to one or more bam files
    :type bams: str | list[str]
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param categories: Name(s) of one or more transposon (super-)families
    :type categories: str | list[str]
    :param quality: Minimum mapping quality of reads
    :type quality: int
    :param tag: Sam tag containing each read's mate element name
    :type tag: str

    :return: A set of contigs of read tips categorised by reference,
        strand, category (family), and source (bam file name)
    :rtype: :class:`loci2.ContigSet`
    """
    # normalise scalar arguments to lists
    if isinstance(bams, str):
        bams = [bams]
    if isinstance(references, str):
        references = [references]
    if isinstance(categories, str):
        categories = [categories]

    # pre-populate one queue per (reference, strand, category, source)
    sources = [os.path.basename(bam) for bam in bams]
    reference_names = [ref.split(':')[0] for ref in references]
    bins = {
        loci.Header(*key): deque()
        for key in product(reference_names, ['+', '-'], categories, sources)
    }

    for bam in bams:
        for reference in references:
            for read in _extract_bam_read_data(bam,
                                               reference,
                                               quality=quality,
                                               tags=[tag]):

                # categories whose name prefixes the read's mate element
                matches = [cat for cat in categories
                           if read[tag].startswith(cat)]

                # skip reads that belong to none of the requested categories
                if not matches:
                    continue

                # the longest matching category name wins
                best_category = max(matches, key=len)

                header = loci.Header(reference=read['reference'],
                                     strand=read['strand'],
                                     category=best_category,
                                     source=read['source'])

                # reverse-strand reads contribute their start position,
                # all others their stop position
                if read['strand'] == '-':
                    tip = read['start']
                else:
                    tip = read['stop']
                bins[header].append((tip, read[tag]))

    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    return loci.ContigSet(*(loci.Contig(header, np.array(data, dtype=dtype))
                            for header, data in bins.items()))
# Example #26
def extract_gff_intervals(gff, references, categories):
    """
    Extract known transposon intervals from a gff annotation file.

    :param gff: Path to a gff file of transposon annotations
    :type gff: str
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param categories: Name(s) of one or more transposon (super-)families
    :type categories: str | list[str]

    :return: A set of contigs of known transposon intervals categorised
        by reference, category (family), and source (gff file name)
    :rtype: :class:`loci2.ContigSet`
    """
    if isinstance(references, str):
        references = [references]

    if isinstance(categories, str):
        categories = [categories]

    source = os.path.basename(gff)
    references = [reference.split(':')[0] for reference in references]

    # pre-populate a queue for every (reference, category) combination
    keys = product(references, categories)
    dictionary = {
        loci.Header(reference=key[0], category=key[1], source=source): deque()
        for key in keys
    }

    with zopen(gff, 'rb') as infile:
        for line in infile:
            line = line.decode().split('\t')

            # match to reference; comment/directive lines never match a
            # reference name, so they are skipped before any further
            # columns are accessed
            reference = decode_column(line[0])
            if reference in references:

                # match to a category
                feature_type = decode_column(line[2])
                category_matches = tuple(
                    filter(lambda x: feature_type.startswith(x), categories))

                # only include features for specified categories
                if category_matches:

                    # longest matching category is the best category
                    category = max(category_matches, key=len)

                    header = loci.Header(reference=reference,
                                         category=category,
                                         source=source)

                    # gff columns 4 and 5 hold the start and stop positions
                    dictionary[header].append(
                        (int(line[3]), int(line[4]), feature_type))

    # use plain 'O' for the object field: a byte-order character ('<O')
    # is meaningless for object dtypes and inconsistent with the dtype
    # used by the sibling extractors in this module
    dtype = np.dtype([('start', np.int64), ('stop', np.int64),
                      ('element', 'O')])
    return loci.ContigSet(*(loci.Contig(header, np.array(data, dtype=dtype))
                            for header, data in dictionary.items()))
# Example #27
def extract_anchor_intervals(bams,
                             references,
                             known_transposons,
                             insert_size,
                             quality=0):
    """
    Extract 'anchor' read inserts from one or more bam files.

    Anchor reads are paired reads in which neither has been mapped
    to a known transposon.
    The pair has then been mapped to a reference genome.
    Assuming that the insert size of the pair is smaller than the
    length of a transposon, the insert can be used to indicate a
    section of the samples genome in which there are no transposons
    on at least one allele.
    This can be used to infer heterozygosity of transposon
    insertions.

    Known transposon inserts from the reference genome are required for
    checking that anchor inserts overlapping these transposons are of
    a sensible length.

    Anchor reads are compressed to their interval unions for efficiency.

    :param bams: Path(s) to one or more bam files
    :type bams: str | list[str]
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param known_transposons: Transposons known from the reference genome
    :type known_transposons: :class:`loci2.ContigSet`
    :param insert_size: Read pair insert size
    :type insert_size: int
    :param quality: Minimum mapping quality of anchor reads
    :type quality: int

    :return: A set of contigs of unions of anchor inserts categorised
        by reference, strand, and source (bam file name)
    :rtype: :class:`loci2.ContigSet`
    """
    if isinstance(bams, str):
        bams = [bams]
    if isinstance(references, str):
        references = [references]

    # collapse known-transposon headers (neutral strand, no category or
    # source) so they can be looked up per reference below
    known_transposons = known_transposons.map(
        lambda x: loci.mutate_header(x, strand='.', category=None,
                                     source=None),
        append_duplicate_headers=True)

    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])
    result = loci.ContigSet()

    for bam, reference in product(bams, references):
        reference_name = reference.split(':')[0]
        header = loci.Header(reference=reference_name,
                             source=os.path.basename(bam),
                             strand='.')

        anchors = np.fromiter(
            _extract_bam_anchor_insert_data(bam, reference, quality=quality),
            dtype=dtype)
        anchor_lengths = interval.lengths(anchors)

        # total length of known transposons contained within each anchor
        local_tes = known_transposons[loci.Header(reference=reference_name,
                                                  strand='.')]
        contained_te_lengths = interval.length_of_contains(
            anchors, local_tes.loci)

        # keep only anchors whose transposon-adjusted length does not
        # exceed the expected insert size
        keep = (anchor_lengths - contained_te_lengths) <= insert_size

        # store the unions of the surviving anchors as loci
        result.add(loci.unions(loci.Contig(header=header,
                                           loci=anchors[keep])))

    return result