Example #1
    def test_add_append_headers(self):
        """Contigs with same header should be appended"""
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header = loci.Header(reference='chr1', strand='+', category='gypsy')

        contig_1 = loci.Contig(
            header, np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype_loci))

        contig_2 = loci.Contig(
            header, np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype_loci))

        query = loci.ContigSet(contig_1)
        query.add(contig_2, append_duplicate_headers=True)

        assert len(query) == 4
        assert len(list(query.contigs())) == 1
        assert len(query.headers()) == 1

        query_loci = list(query.contigs())[0].loci

        answer_loci = np.array([(1, 'gypsy1'), (7, 'gypsy4'), (3, 'gypsy7'),
                                (9, 'gypsy1')],
                               dtype=dtype_loci)

        npt.assert_array_equal(query_loci, answer_loci)
Example #2
def test_append():
    loci_dtype = np.dtype([('tip', np.int64), ('element', 'O')])

    query_1 = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                 dtype=loci_dtype))

    query_2 = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(9, 'element4'), (6, 'element5'), (2, 'element6')],
                 dtype=loci_dtype))

    answer = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(5, 'element1'), (1, 'element2'), (7, 'element3'),
                  (9, 'element4'), (6, 'element5'), (2, 'element6')],
                 dtype=loci_dtype))

    assert loci.append(query_1, query_2) == answer
Example #3
def test_create_contig_ids():
    dtype_loci_query = np.dtype([('start', np.int64), ('stop', np.int64),
                                 ('element', 'O')])

    query = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 5, 'gypsy1'), (7, 9, 'gypsy4')],
                     dtype=dtype_loci_query)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='gypsy'),
            np.array([(3, 8, 'gypsy7'), (9, 12, 'gypsy1')],
                     dtype=dtype_loci_query)))

    dtype_loci_answer = np.dtype([('start', np.int64), ('stop', np.int64),
                                  ('element', 'O'), ('ID', 'O')])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 5, 'gypsy1', 'gypsy_chr1_+_5'),
                      (7, 9, 'gypsy4', 'gypsy_chr1_+_9')],
                     dtype=dtype_loci_answer)),
        loci.Contig(
            loci.Header(reference='chr1', strand='-', category='gypsy'),
            np.array([(3, 8, 'gypsy7', 'gypsy_chr1_-_3'),
                      (9, 12, 'gypsy1', 'gypsy_chr1_-_9')],
                     dtype=dtype_loci_answer)))

    assert query.map(fingerprint.create_contig_ids) == answer
Example #4
def test_append_header_miss_match():
    loci_dtype = np.dtype([('tip', np.int64), ('element', 'O')])

    query_1 = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                 dtype=loci_dtype))

    query_2 = loci.Contig(
        loci.Header(
            reference='chr2',  # mismatched
            strand='-',
            category='family',
            source='bam'),
        np.array([(9, 'element4'), (6, 'element5'), (2, 'element6')],
                 dtype=loci_dtype))

    try:
        loci.append(query_1, query_2)
    except ValueError:
        pass
    else:
        assert False
Example #5
def test_extract_gff_intervals():
    gff = DATA_PATH + 'testAnnotation-2017-11-27.gff'

    query = fingerprintio.extract_gff_intervals(gff, 'chr1',
                                                ['Gypsy', 'Copia'])

    dtype_loci = np.dtype([('start', np.int64), ('stop', np.int64),
                           ('element', '<O')])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1',
                        category='Gypsy',
                        source='testAnnotation-2017-11-27.gff'),
            np.array([(3150, 3200, 'Gypsy-21_ClassI;chr1:3150-3200'),
                      (24250, 24700, 'Gypsy-21_ClassI;chr1:24250-24700')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        category='Copia',
                        source='testAnnotation-2017-11-27.gff'),
            np.array([(98260, 98322, 'Copia-10_ClassI;chr1:98260-98322')],
                     dtype=dtype_loci)))

    assert query == answer
Example #6
    def test_map(self):
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))
        contig_alt_1 = loci.Contig(
            header_1,
            np.array([(101, 'gypsy1'), (107, 'gypsy4')], dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))
        contig_alt_2 = loci.Contig(
            header_2,
            np.array([(103, 'gypsy7'), (109, 'gypsy1')], dtype=dtype_loci))

        query = loci.ContigSet(contig_1, contig_2)

        def func(contig):
            """dummy function that adds 100 to contig loci 'tip's"""
            array = np.copy(contig.loci)
            array['tip'] += 100
            return loci.Contig(contig.header, array)

        query = query.map(func)

        answer = loci.ContigSet(contig_alt_1, contig_alt_2)

        assert query == answer
Example #7
def test_count_reads_n2():
    dtype_loci_reads = np.dtype([('tip', np.int64), ('element', 'O')])

    reads = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='gypsy',
                        source='bam1'),
            np.array([(2, 'gypsy1'), (4, 'gypsy1'), (5, 'gypsy4'),
                      (7, 'gypsy4'), (7, 'gypsy7'), (7, 'gypsy1'),
                      (8, 'gypsy1'), (8, 'gypsy1')],
                     dtype=dtype_loci_reads)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='gypsy',
                        source='bam2'),
            np.array([(3, 'gypsy1'), (4, 'gypsy1'), (6, 'gypsy1'),
                      (7, 'gypsy1'), (7, 'gypsy1'), (7, 'gypsy1'),
                      (7, 'gypsy1'), (50, 'gypsy7')],
                     dtype=dtype_loci_reads)))

    dtype_loci_query = np.dtype([('start', np.int64), ('stop', np.int64)])

    query = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(1, 15), (30, 60)], dtype=dtype_loci_query)))

    dtype_loci_answer = np.dtype([
        ('start', np.int64), ('stop', np.int64), ('median', np.int64),
        ('sample', [('0', [('name', 'O'), ('count', np.int64),
                           ('element', [('0', [('name', 'O'),
                                               ('count', np.int64)]),
                                        ('1', [('name', 'O'),
                                               ('count', np.int64)])])]),
                    ('1', [('name', 'O'), ('count', np.int64),
                           ('element', [('0', [('name', 'O'),
                                               ('count', np.int64)]),
                                        ('1', [('name', 'O'),
                                               ('count', np.int64)])])])])
    ])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1', strand='+', category='gypsy'),
            np.array([(2, 8, 7, (('bam1', 8, (('gypsy1', 5), ('gypsy4', 2))),
                                 ('bam2', 7, (('gypsy1', 7), ('.', 0))))),
                      (50, 50, 50, (('bam1', 0, (('.', 0), ('.', 0))),
                                    ('bam2', 1, (('gypsy7', 1), ('.', 0)))))],
                     dtype=dtype_loci_answer)))

    assert fingerprint.count_reads(query,
                                   reads,
                                   trim=True,
                                   n_common_elements=2) == answer
Example #8
def test_cluster():
    header = loci.Header(reference='chr1',
                         strand='-',
                         category='Gypsy',
                         source='bam')
    query_loci = np.array([(0, 'Gypsy'), (0, 'Gypsy'), (60, 'Gypsy'),
                           (61, 'Gypsy'), (61, 'Gypsy'), (61, 'Gypsy'),
                           (76, 'Gypsy'), (78, 'Gypsy'), (122, 'Gypsy'),
                           (122, 'Gypsy'), (141, 'Gypsy'), (183, 'Gypsy'),
                           (251, 'Gypsy'), (260, 'Gypsy'), (260, 'Gypsy'),
                           (263, 'Gypsy'), (263, 'Gypsy'), (267, 'Gypsy'),
                           (267, 'Gypsy'), (288, 'Gypsy'), (288, 'Gypsy'),
                           (295, 'Gypsy'), (300, 'Gypsy'), (310, 'Gypsy'),
                           (310, 'Gypsy'), (317, 'Gypsy'), (317, 'Gypsy'),
                           (334, 'Gypsy'), (334, 'Gypsy'), (335, 'Gypsy'),
                           (338, 'Gypsy'), (338, 'Gypsy'), (338, 'Gypsy'),
                           (338, 'Gypsy'), (340, 'Gypsy'), (342, 'Gypsy'),
                           (342, 'Gypsy'), (344, 'Gypsy'), (344, 'Gypsy'),
                           (358, 'Gypsy'), (367, 'Gypsy'), (370, 'Gypsy'),
                           (370, 'Gypsy'), (377, 'Gypsy'), (387, 'Gypsy'),
                           (402, 'Gypsy'), (403, 'Gypsy'), (410, 'Gypsy'),
                           (410, 'Gypsy'), (410, 'Gypsy'), (418, 'Gypsy'),
                           (418, 'Gypsy'), (424, 'Gypsy'), (424, 'Gypsy'),
                           (577, 'Gypsy'), (857, 'Gypsy'), (879, 'Gypsy'),
                           (921, 'Gypsy'), (921, 'Gypsy'), (1007, 'Gypsy'),
                           (1031, 'Gypsy'), (1051, 'Gypsy'), (1051, 'Gypsy'),
                           (1059, 'Gypsy'), (1071, 'Gypsy'), (1071, 'Gypsy'),
                           (1080, 'Gypsy'), (1094, 'Gypsy'), (1094, 'Gypsy'),
                           (1110, 'Gypsy'), (1110, 'Gypsy'), (1113, 'Gypsy'),
                           (1113, 'Gypsy'), (1183, 'Gypsy'), (1189, 'Gypsy'),
                           (1200, 'Gypsy'), (1200, 'Gypsy'), (1217, 'Gypsy'),
                           (1234, 'Gypsy'), (1234, 'Gypsy'), (1591, 'Gypsy'),
                           (1620, 'Gypsy'), (1620, 'Gypsy'), (1662, 'Gypsy'),
                           (1686, 'Gypsy'), (1707, 'Gypsy'), (1755, 'Gypsy'),
                           (1828, 'Gypsy'), (1828, 'Gypsy'), (1848, 'Gypsy'),
                           (1848, 'Gypsy'), (1848, 'Gypsy'), (1848, 'Gypsy'),
                           (1851, 'Gypsy'), (1851, 'Gypsy'), (1852, 'Gypsy'),
                           (1917, 'Gypsy')],
                          dtype=np.dtype([('tip', np.int64),
                                          ('element', 'O')]))
    query = loci.Contig(header, query_loci)

    answer_loci = np.array([(0, 577), (879, 1234), (1662, 1917)],
                           dtype=np.dtype([('start', np.int64),
                                           ('stop', np.int64)]))
    answer = loci.Contig(header, answer_loci)

    assert loci.clusters(query,
                         'tip',
                         10,
                         epsilon=200,
                         minimum_epsilon=10,
                         method='SDBICAN-aggressive') == answer
Example #9
def test_unions_buffered():
    header = loci.Header(reference='chr1', strand='-', source='bam')

    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])

    query = loci.Contig(
        header,
        np.array([(3, 6), (6, 8), (7, 9), (10, 12), (13, 13), (15, 25),
                  (16, 17), (19, 20)],
                 dtype=dtype))

    answer = loci.Contig(
        header, np.array([(-2, 9), (10, 12), (13, 14), (15, 30)], dtype=dtype))

    assert loci.unions_buffered(query, 5) == answer
Example #10
def create_contig_ids(contig):
    """
    Create ids for contig based on category, reference, strand and position.

    :param contig: a collection of cluster loci (intervals)
    :type contig: :class:`loci.Contig`

    :return: a collection of cluster loci (intervals) with 'ID' field
    :rtype: :class:`loci.Contig`
    """
    template = '{0}_{1}_{2}_'.format(contig.header.category,
                                     contig.header.reference,
                                     contig.header.strand)
    if contig.header.strand == '+':
        position = 'stop'
    elif contig.header.strand == '-':
        position = 'start'
    else:
        assert False

    ids = [template + str(element[position]) for element in contig.loci]
    ids = np.array(ids)
    ids = np.array(ids, dtype=np.dtype([('ID', '<O')]))

    loci_data = util.numpy.array.bind(contig.loci, ids)

    return loci.Contig(contig.header, loci_data)
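
A minimal usage sketch for create_contig_ids, assuming numpy is imported as np and the loci and fingerprint modules are imported as in the surrounding examples; the data mirror the test_create_contig_ids example above, so the expected IDs come straight from that test's answer.

import numpy as np

dtype = np.dtype([('start', np.int64), ('stop', np.int64), ('element', 'O')])

contig = loci.Contig(
    loci.Header(reference='chr1', strand='+', category='gypsy'),
    np.array([(1, 5, 'gypsy1'), (7, 9, 'gypsy4')], dtype=dtype))

# the '+' strand means the 'stop' position is used in each ID
labelled = fingerprint.create_contig_ids(contig)
# the 'ID' field now holds 'gypsy_chr1_+_5' and 'gypsy_chr1_+_9'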
Example #11
def test_sort():
    loci_dtype = np.dtype([('tip', np.int64), ('element', 'O')])

    header = loci.Header(reference='chr1', strand='-', category='family')

    query_loci = np.array([(5, 'element1'), (1, 'element2'), (7, 'element3'),
                           (9, 'element4'), (6, 'element5'), (2, 'element6')],
                          dtype=loci_dtype)
    query = loci.Contig(header, query_loci)

    answer_loci = np.array([(1, 'element2'), (2, 'element6'), (5, 'element1'),
                            (6, 'element5'), (7, 'element3'), (9, 'element4')],
                           dtype=loci_dtype)
    answer = loci.Contig(header, answer_loci)

    assert loci.sort(query, order='tip') == answer
    assert loci.sort(query, order='tip') != query
Example #12
def test_drop_field():
    query = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'),
        np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                 dtype=np.dtype([('tip', np.int64), ('element', 'O')])))

    answer = loci.Contig(
        loci.Header(reference='chr1',
                    strand='-',
                    category='family',
                    source='bam'), np.array([5, 1, 7]))
    answer.loci = np.array(answer.loci, np.dtype([('tip', np.int64)]))

    assert loci.drop_field(query, 'element') == answer
Example #13
    def test_headers(self):
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))

        query = set(loci.ContigSet(contig_1, contig_2).headers())
        answer = {header_1, header_2}

        assert query == answer
Example #14
    def test_dtype_loci(self):
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))

        query = loci.ContigSet(contig_1, contig_2)

        assert query.dtype_loci() == contig_1.loci.dtype
        assert query.dtype_loci() == contig_2.loci.dtype
Example #15
    def test_init_clashing_headers(self):
        """Contigs with same header should cause ValueError"""
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header = loci.Header(reference='chr1', strand='+', category='gypsy')

        contig_1 = loci.Contig(
            header, np.array([(1, 'gypsy1'), (7, 'gypsy4')], dtype=dtype_loci))

        contig_2 = loci.Contig(
            header, np.array([(3, 'gypsy7'), (9, 'gypsy1')], dtype=dtype_loci))

        try:
            loci.ContigSet(contig_1, contig_2)
        except ValueError:
            assert True
        else:
            assert False
Example #16
def test_unions(query, answer):
    """
    Test includes the following edge cases:
     * Long locus completely overlaps short loci:
        (15, 25) & (16, 17) & (19, 20) --> (15, 25)
     * Adjacent loci do not get merged:
        (7, 9) & (10, 12) -->  (*, 9) & (10, *)
     * Locus may span a single base:
        (13, 13) --> (13, 13)
    """
    header = loci.Header(reference='chr1', strand='-', source='bam')

    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])

    query = loci.Contig(header, np.array(query, dtype=dtype))

    answer = loci.Contig(header, np.array(answer, dtype=dtype))

    assert loci.unions(query) == answer
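
test_unions takes query and answer as arguments, so it is presumably driven by a pytest parametrize decorator that is not shown here. A plausible parametrization covering the edge cases named in the docstring might look like the sketch below; the exact cases used in the project are an assumption.

import pytest

@pytest.mark.parametrize('query,answer', [
    # long locus completely overlaps short loci
    ([(15, 25), (16, 17), (19, 20)], [(15, 25)]),
    # adjacent loci are not merged
    ([(7, 9), (10, 12)], [(7, 9), (10, 12)]),
    # a locus may span a single base
    ([(13, 13)], [(13, 13)]),
])
def test_unions(query, answer):
    ...  # body as defined above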
Example #17
    def test_init_different_headers(self):
        """"""
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))

        query = loci.ContigSet(contig_1, contig_2)

        assert len(query) == 4
        assert len(list(query.contigs())) == 2
        assert len(query.headers()) == 2
Example #18
def match_known_insertions(clusters, known_insertions, distance=0):
    """
    Match clusters to known insertions annotated in the genome.

    Known insertions are represented as an object of
    :class:`loci.ContigSet` created from a gff file.
    Clusters are matched to a known insertion if they are of the
    same category and are within the specified distance of the
    insertion's end.

    Fields required in 'clusters':
        'start': int, 'stop': int, 'median': int

    Fields required in 'known_insertions':
        'start': int, 'stop': int, 'element': str

    Fields appended to return value:
        'known_element': str

    :param clusters: a collection of cluster loci (intervals)
    :type clusters: :class:`loci.ContigSet`
    :param known_insertions: a collection of cluster loci (intervals)
    :type known_insertions: :class:`loci.ContigSet`
    :param distance: maximum distance for connecting a cluster to a
        known insertion
    :type distance: int

    :return: a collection of cluster loci (intervals) tagged with
        known insertions
    :rtype: :class:`loci.ContigSet`
    """
    matched = loci.ContigSet()

    # make known insertion headers un-stranded and drop origin file
    known_insertions = known_insertions.map(lambda x:
                                            loci.mutate_header(x, strand='.',
                                                               source=None))

    # loop through contigs
    for contig in clusters.contigs():

        # get relevant known insertions
        known = known_insertions[contig.header.mutate(strand='.')]

        matches = np.array(list(_known_insertion_matcher(contig,
                                                         known,
                                                         distance=distance)))
        matches = np.array(matches,
                           dtype=np.dtype([('known_element', '<O')]))

        matched.add(loci.Contig(contig.header,
                                util.numpy.array.bind(contig.loci,
                                                      matches)))

    return matched
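
A hedged usage sketch for match_known_insertions. The field layouts follow the docstring and the constructors used in the other examples, but the concrete values and the assumption that the function is exposed from the same fingerprint module as create_contig_ids are illustrative only.

import numpy as np

dtype_clusters = np.dtype([('start', np.int64), ('stop', np.int64),
                           ('median', np.int64)])
clusters = loci.ContigSet(
    loci.Contig(
        loci.Header(reference='chr1', strand='+', category='Gypsy'),
        np.array([(100, 400, 380)], dtype=dtype_clusters)))

dtype_known = np.dtype([('start', np.int64), ('stop', np.int64),
                        ('element', 'O')])
known = loci.ContigSet(
    loci.Contig(
        loci.Header(reference='chr1', category='Gypsy',
                    source='annotation.gff'),
        np.array([(390, 900, 'Gypsy-21_ClassI')], dtype=dtype_known)))

# clusters of the same category within `distance` of a known insertion
# gain a 'known_element' field naming that insertion
tagged = fingerprint.match_known_insertions(clusters, known, distance=10)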
Example #19
    def test_iter_values(self):
        dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

        header_1 = loci.Header(reference='chr1', strand='+', category='gypsy')
        contig_1 = loci.Contig(
            header_1, np.array([(1, 'gypsy1'), (7, 'gypsy4')],
                               dtype=dtype_loci))

        header_2 = loci.Header(reference='chr2', strand='+', category='gypsy')
        contig_2 = loci.Contig(
            header_2, np.array([(3, 'gypsy7'), (9, 'gypsy1')],
                               dtype=dtype_loci))

        query = loci.ContigSet(contig_1, contig_2)

        answer = {('chr1', '+', 'gypsy', 1, 'gypsy1'),
                  ('chr1', '+', 'gypsy', 7, 'gypsy4'),
                  ('chr2', '+', 'gypsy', 3, 'gypsy7'),
                  ('chr2', '+', 'gypsy', 9, 'gypsy1')}

        assert set(query.iter_values()) == answer
Example #20
def test_cluster_empty():
    header = loci.Header(reference='chr1',
                         strand='-',
                         category='Gypsy',
                         source='bam')
    query_loci = np.array([],
                          dtype=np.dtype([('tip', np.int64),
                                          ('element', 'O')]))
    query = loci.Contig(header, query_loci)

    answer_loci = np.array([],
                           dtype=np.dtype([('start', np.int64),
                                           ('stop', np.int64)]))
    answer = loci.Contig(header, answer_loci)

    assert loci.clusters(query,
                         'tip',
                         10,
                         epsilon=200,
                         minimum_epsilon=10,
                         method='SDBICAN-aggressive') == answer
Example #21
def test_as_array():
    """Test conversion of nested loci data to flat array"""
    dtype_element_count = np.dtype([('name', 'O'), ('count', np.int64)])
    dtype_elements = np.dtype([(str(i), dtype_element_count)
                               for i in range(2)])
    dtype_sample_count = np.dtype([('name', 'O'), ('count', np.int64),
                                   ('element', dtype_elements)])
    dtype_samples = np.dtype([(str(i), dtype_sample_count) for i in range(2)])
    dtype_loci = np.dtype([('start', np.int64), ('stop', np.int64),
                           ('sample', dtype_samples)])

    # 3 element array with nested structured data
    data = np.array([(10, 15, (('bam1', 9, (('gypsy7', 5), ('gypsy3', 3))),
                               ('bam2', 8, (('gypsy7', 7), ('gypsy3', 1))))),
                     (21, 32, (('bam1', 7, (('gypsy3', 5), ('gypsy1', 2))),
                               ('bam2', 7, (('gypsy3', 7), (None, 0))))),
                     (43, 61, (('bam1', 5, (('gypsy9', 3), ('gypsy3', 2))),
                               ('bam2', 6, (('gypsy3', 5), ('gypsy9', 1)))))],
                    dtype=dtype_loci)

    header = loci.Header(reference='chr1', strand='-', category='gypsy')

    query = loci.Contig(header, data)

    # array dtype includes header fields
    answer_dtype = np.dtype([('reference', 'O'), ('strand', '<U1'),
                             ('category', 'O'), ('start', np.int64),
                             ('stop', np.int64), ('sample_0_name', 'O'),
                             ('sample_0_count', np.int64),
                             ('sample_0_element_0_name', 'O'),
                             ('sample_0_element_0_count', np.int64),
                             ('sample_0_element_1_name', 'O'),
                             ('sample_0_element_1_count', np.int64),
                             ('sample_1_name', 'O'),
                             ('sample_1_count', np.int64),
                             ('sample_1_element_0_name', 'O'),
                             ('sample_1_element_0_count', np.int64),
                             ('sample_1_element_1_name', 'O'),
                             ('sample_1_element_1_count', np.int64)])

    # 3 element array with flat structured data
    answer = np.array([('chr1', '-', 'gypsy', 10, 15, 'bam1', 9, 'gypsy7', 5,
                        'gypsy3', 3, 'bam2', 8, 'gypsy7', 7, 'gypsy3', 1),
                       ('chr1', '-', 'gypsy', 21, 32, 'bam1', 7, 'gypsy3', 5,
                        'gypsy1', 2, 'bam2', 7, 'gypsy3', 7, None, 0),
                       ('chr1', '-', 'gypsy', 43, 61, 'bam1', 5, 'gypsy9', 3,
                        'gypsy3', 2, 'bam2', 6, 'gypsy3', 5, 'gypsy9', 1)],
                      dtype=answer_dtype)

    npt.assert_array_equal(loci.as_array(query), answer)
Example #22
def test_mutate_header():
    loci_dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    data = np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                    dtype=loci_dtype)

    query_header = loci.Header(reference='chr1',
                               strand='-',
                               category=None,
                               source='bam')
    query = loci.Contig(query_header, data)

    answer_header = loci.Header(reference='chr1',
                                strand='.',
                                category='family',
                                source=None)
    answer = loci.Contig(answer_header, data)

    assert loci.mutate_header(query,
                              strand='.',
                              category='family',
                              source=None) == answer
    assert loci.mutate_header(
        query, strand='.', category='family', source=None) != query
Example #23
def test_iter_values():
    loci_dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    data = np.array([(5, 'element1'), (1, 'element2'), (7, 'element3')],
                    dtype=loci_dtype)

    query_header = loci.Header(reference='chr1',
                               strand='-',
                               category=None,
                               source='bam')
    query = loci.Contig(query_header, data)

    answer = [('chr1', '-', 'bam', 5, 'element1'),
              ('chr1', '-', 'bam', 1, 'element2'),
              ('chr1', '-', 'bam', 7, 'element3')]

    assert list(loci.iter_values(query)) == answer
Example #24
def pair_clusters(clusters, distance=0, use_known_elements=True):
    """
    Join matching clusters on opposite strands.

    Clusters of the same category are joined if they are within
    2 * distance of one another.
    Clusters may also be joined if they have both been matched
    to the same known element.

    Fields required in 'clusters':
        'start': int, 'stop': int, 'median': int,
        'known_element': str, 'ID': str

    Fields appended to return value:
        'pair': str

    :param clusters: a collection of cluster loci (intervals)
    :type clusters: :class:`loci.ContigSet`
    :param distance: the distance to search out from each cluster
    :type distance: int
    :param use_known_elements: specify whether to join pairs based on a
        common known element (default: True)
    :type use_known_elements: bool

    :return: a collection of cluster loci (intervals) with 'pair' field
    :rtype: :class:`loci.ContigSet`
    """
    joint_clusters = loci.ContigSet()

    dtype_join_data = np.dtype([("pair", "<O")])

    # new headers based on old but un-stranded
    new_headers = {h.mutate(strand='.') for h in clusters.headers()}

    for header in new_headers:
        # get forward and reverse loci for this key
        forward = clusters[header.mutate(strand='+')]
        reverse = clusters[header.mutate(strand='-')]

        # sort them into pairs based on median
        pairs = _cluster_pairer(forward,
                                reverse,
                                distance=distance,
                                use_known_elements=use_known_elements)

        # create arrays for the new data
        forward_join_data = np.empty(len(forward), dtype=dtype_join_data)
        forward_join_data["pair"] = '.'
        reverse_join_data = np.empty(len(reverse), dtype=dtype_join_data)
        reverse_join_data["pair"] = '.'
        for f, r in pairs:
            if f is not None and r is not None:
                forward_join_data[f]["pair"] = reverse.loci[r]["ID"]
                reverse_join_data[r]["pair"] = forward.loci[f]["ID"]
            else:
                pass

        # combine existing data with join data and add to new contig set
        joint_clusters.add(loci.Contig(header.mutate(strand='+'),
                                       util.numpy.array.bind(forward.loci,
                                                             forward_join_data)))
        joint_clusters.add(loci.Contig(header.mutate(strand='-'),
                                       util.numpy.array.bind(reverse.loci,
                                                             reverse_join_data)))

    return joint_clusters
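
A sketch of how pair_clusters might be called, using the fields named in its docstring. The IDs follow the format produced by create_contig_ids, but the values, the distance, and the assumption that the function lives in the fingerprint module are all illustrative.

import numpy as np

dtype = np.dtype([('start', np.int64), ('stop', np.int64),
                  ('median', np.int64), ('known_element', 'O'), ('ID', 'O')])

forward = loci.Contig(
    loci.Header(reference='chr1', strand='+', category='Gypsy'),
    np.array([(100, 400, 380, '.', 'Gypsy_chr1_+_400')], dtype=dtype))
reverse = loci.Contig(
    loci.Header(reference='chr1', strand='-', category='Gypsy'),
    np.array([(420, 800, 440, '.', 'Gypsy_chr1_-_420')], dtype=dtype))

# matching forward/reverse clusters are labelled with each other's
# 'ID' in a new 'pair' field; unpaired clusters keep '.'
paired = fingerprint.pair_clusters(loci.ContigSet(forward, reverse),
                                   distance=50,
                                   use_known_elements=False)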
Example #25
def func(contig):
    """dummy function that adds 100 to contig loci 'tip's"""
    array = np.copy(contig.loci)
    array['tip'] += 100
    return loci.Contig(contig.header, array)
Example #26
def extract_anchor_intervals(bams,
                             references,
                             known_transposons,
                             insert_size,
                             quality=0):
    """
    Extract 'anchor' read inserts from one or more bam files.

    Anchor reads are paired reads in which neither read has been mapped
    to a known transposon.
    The pair has then been mapped to a reference genome.
    Assuming that the insert size of the pair is smaller than the
    length of a transposon, the insert can be used to indicate a
    section of the sample's genome in which there are no transposons
    on at least one allele.
    This can be used to infer heterozygosity of transposon
    insertions.

    Known transposon inserts from the reference genome are required for
    checking that anchor inserts overlapping these transposons are of
    a sensible length.

    Anchor reads are compressed to their interval unions for efficiency.

    :param bams: Path(s) to one or more bam files
    :type bams: str | list[str]
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param known_transposons: Transposons known from the reference genome
    :type known_transposons: :class:`loci2.ContigSet`
    :param insert_size: Read pair insert size
    :type insert_size: int
    :param quality: Minimum mapping quality of anchor reads
    :type quality: int

    :return: A set of contigs of unions of anchor inserts categorised
        by reference, strand, and source (bam file name)
    :rtype: :class:`loci2.ContigSet`
    """
    if isinstance(bams, str):
        bams = [bams]

    if isinstance(references, str):
        references = [references]

    # simplify known transposon headers for comparison
    known_transposons = known_transposons.map(lambda x: loci.mutate_header(
        x, strand='.', category=None, source=None),
                                              append_duplicate_headers=True)

    jobs = product(bams, references)
    dtype = np.dtype([('start', np.int64), ('stop', np.int64)])
    intervals = loci.ContigSet()

    for bam, reference in jobs:
        header = loci.Header(reference=reference.split(':')[0],
                             source=os.path.basename(bam),
                             strand='.')
        anchors = np.fromiter(_extract_bam_anchor_insert_data(bam,
                                                              reference,
                                                              quality=quality),
                              dtype=dtype)
        anchor_lengths = interval.lengths(anchors)

        # calculate lengths of known transposons within each anchor interval
        reference_name = reference.split(':')[0]
        local_tes_header = loci.Header(reference=reference_name, strand='.')
        local_tes = known_transposons[local_tes_header]
        contained_te_lengths = interval.length_of_contains(
            anchors, local_tes.loci)

        # filter anchors based on insert size
        adjusted_anchor_lengths = anchor_lengths - contained_te_lengths
        anchors = anchors[adjusted_anchor_lengths <= insert_size]

        # use unions of filtered anchors as loci
        intervals.add(loci.unions(loci.Contig(header=header, loci=anchors)))

    return intervals
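
A hedged sketch of calling extract_anchor_intervals. The bam path, reference name, and insert size are placeholders, known transposons are loaded as in the extract_gff_intervals test above, and the assumption that the function sits in the same fingerprintio module as the other extractors is not confirmed by the source.

# placeholder inputs
known = fingerprintio.extract_gff_intervals('annotation.gff',
                                            'chr1', ['Gypsy', 'Copia'])

anchors = fingerprintio.extract_anchor_intervals('sample.bam',
                                                 'chr1',
                                                 known,
                                                 insert_size=700,
                                                 quality=30)

# one contig of anchor interval unions per header
for contig in anchors.contigs():
    print(contig.header, len(contig))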
Example #27
def test_extract_informative_read_tips():
    """
    Test extraction of informative reads.
    Only the requested families of reads are extracted.
    A requested family with no reads ('NOT-A-FAMILY') yields empty contigs.
    """
    bam = DATA_PATH + 'testA-2017-06-08.bam'

    query = fingerprintio.extract_informative_read_tips(
        bam,
        'chr1', ['Gypsy', 'PIF-Harbinger', 'NOT-A-FAMILY'],
        quality=0,
        tag='ME')

    dtype_loci = np.dtype([('tip', np.int64), ('element', 'O')])

    answer = loci.ContigSet(
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='Gypsy',
                        source='testA-2017-06-08.bam'),
            np.array([(2452, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2506, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2553, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2566, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2577, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2577, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr15_18793972'),
                      (2841, 'Gypsy_Gypsy26_chr8_2502854'),
                      (2973, 'Gypsy_Gypsy26_chr18_27801424'),
                      (3024, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3062, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3039, 'Gypsy_Gypsy26_chr2_1987286'),
                      (3138, 'Gypsy_Gypsy26_chr18_27801424'),
                      (24065, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24184, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24195, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24217, 'Gypsy_Gypsy12_chr1_12715223')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='Gypsy',
                        source='testA-2017-06-08.bam'),
            np.array([(3217, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3226, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3246, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3405, 'Gypsy_Gypsy26_chr2_1987286'),
                      (3646, 'Gypsy_Gypsy26_chr15_18793972'),
                      (3776, 'Gypsy_Gypsy26_chr18_27801424'),
                      (3779, 'Gypsy_Gypsy26_chr8_5114633'),
                      (3800, 'Gypsy_Gypsy26_chr8_5114633'),
                      (24787, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24799, 'Gypsy_Gypsy29_chr11_13193899'),
                      (24850, 'Gypsy_Gypsy7_chr4_10302390'),
                      (24854, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24857, 'Gypsy_Gypsy23_chr15_8310356'),
                      (24860, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24872, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24877, 'Gypsy_GYVIT1_chr6_13115950'),
                      (24894, 'Gypsy_Gypsy23_chrUn_38723460'),
                      (24895, 'Gypsy_Gypsy12_chr1_12715223'),
                      (24910, 'Gypsy_Gypsy23_chr14_11656393'),
                      (24919, 'Gypsy_Gypsy23_chrUn_38723460')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='PIF-Harbinger',
                        source='testA-2017-06-08.bam'),
            np.array([(21282, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21308, 'PIF-Harbinger_Harbinger-3_chr2_4407914'),
                      (21435, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21448, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='PIF-Harbinger',
                        source='testA-2017-06-08.bam'),
            np.array([(21834, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21945, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21968, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579'),
                      (21982, 'PIF-Harbinger_Harbinger-3N3_chr16_20723579')],
                     dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='+',
                        category='NOT-A-FAMILY',
                        source='testA-2017-06-08.bam'),
            np.array([], dtype=dtype_loci)),
        loci.Contig(
            loci.Header(reference='chr1',
                        strand='-',
                        category='NOT-A-FAMILY',
                        source='testA-2017-06-08.bam'),
            np.array([], dtype=dtype_loci)))

    assert query == answer
Example #28
def extract_informative_read_tips(bams,
                                  references,
                                  categories,
                                  quality=0,
                                  tag='ME'):
    """
    Extract the tips of 'informative' reads from one or more bam files.

    Informative reads are those that flank potential transposon
    insertions.
    The specific element (mate element) that each read is linked to
    should be stored in a sam tag, which is 'ME' by default.
    Reads are categorised by transposon (super-)families by matching
    family names to the start of each read's mate-element name.

    :param bams: Path(s) to one or more bam files
    :type bams: str | list[str]
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param categories: Name(s) of one or more transposon (super-)families
    :type categories: str | list[str]
    :param quality: Minimum mapping quality of reads
    :type quality: int
    :param tag: Sam tag containing each read's mate-element name
    :type tag: str

    :return: A set of contigs of read tips categorised by reference,
        strand, category (family), and source (bam file name)
    :rtype: :class:`loci2.ContigSet`
    """
    if isinstance(bams, str):
        bams = [bams]

    if isinstance(references, str):
        references = [references]

    if isinstance(categories, str):
        categories = [categories]

    keys = product([ref.split(':')[0] for ref in references], ['+', '-'],
                   categories, [os.path.basename(bam) for bam in bams])
    dictionary = {loci.Header(*key): deque() for key in keys}

    for bam in bams:
        for reference in references:
            for read in _extract_bam_read_data(bam,
                                               reference,
                                               quality=quality,
                                               tags=[tag]):

                # match to a category
                category_matches = tuple(
                    filter(lambda x: read[tag].startswith(x), categories))

                # only include reads for specified categories
                if category_matches:

                    # longest matching category is the best category
                    category = max(category_matches, key=len)

                    # read header
                    header = loci.Header(reference=read['reference'],
                                         strand=read['strand'],
                                         category=category,
                                         source=read['source'])

                    # append loci data to queue
                    tip = read['start'] if \
                        read['strand'] == '-' else \
                        read['stop']
                    dictionary[header].append((tip, read[tag]))

    dtype = np.dtype([('tip', np.int64), ('element', 'O')])
    return loci.ContigSet(*(loci.Contig(header, np.array(data, dtype=dtype))
                            for header, data in dictionary.items()))
Example #29
def extract_gff_intervals(gff, references, categories):
    """
    Extract known transposon intervals from a gff annotation file.

    :param gff: Path to a gff file of transposon annotations
    :type gff: str
    :param references: Name(s) of one or more bam references
    :type references: str | list[str]
    :param categories: Name(s) of one or more transposon (super-)families
    :type categories: str | list[str]

    :return: A set of contigs of annotated intervals categorised by
        reference, category (family), and source (gff file name)
    :rtype: :class:`loci2.ContigSet`
    """
    if isinstance(references, str):
        references = [references]

    if isinstance(categories, str):
        categories = [categories]

    source = os.path.basename(gff)
    references = [reference.split(':')[0] for reference in references]

    keys = product(references, categories)
    dictionary = {
        loci.Header(reference=key[0], category=key[1], source=source): deque()
        for key in keys
    }

    with zopen(gff, 'rb') as infile:
        for line in infile:
            line = line.decode().split('\t')

            # match to reference:
            reference = decode_column(line[0])
            if reference in references:

                # match to a category
                feature_type = decode_column(line[2])
                category_matches = tuple(
                    filter(lambda x: feature_type.startswith(x), categories))

                # only include features for specified categories
                if category_matches:

                    # longest matching category is the best category
                    category = max(category_matches, key=len)

                    header = loci.Header(reference=reference,
                                         category=category,
                                         source=source)

                    dictionary[header].append(
                        (int(line[3]), int(line[4]), feature_type))

    dtype = np.dtype([('start', np.int64), ('stop', np.int64),
                      ('element', '<O')])
    return loci.ContigSet(*(loci.Contig(header, np.array(data, dtype=dtype))
                            for header, data in dictionary.items()))