Beispiel #1
0
    def create_custom_tree(self):
        """Hand-assemble a five-node interval tree without using insertion.

        The (5, 6) node is the root. Its left child is (3, 4), whose own
        children are (1, 2) on the left and (4, 5) on the right; the root's
        right child is (7, 8). The three childless nodes get the tree's
        sentinel as both children, and every node's augmented end is set
        explicitly.

        Returns:
            The ``IVTree`` with the (5, 6) node installed as its root.
        """
        tree = IVTree()
        nil = tu.sentinel(tree)

        # (start, end, flag) per node. The flag is forwarded unchanged as the
        # third IVNode argument -- presumably the node colour, TODO confirm.
        specs = (
            (5, 6, True),
            (3, 4, False),
            (7, 8, True),
            (1, 2, True),
            (4, 5, True),
        )
        intervals = [GenomicInterval(start, end) for start, end, _ in specs]
        nodes = [
            IVNode(iv.start, iv.end, flag, iv)
            for iv, (_, _, flag) in zip(intervals, specs)
        ]
        root, left, right, left_left, left_right = nodes

        # Wire up the inner nodes first ...
        nu.set_left(root, left)
        nu.set_right(root, right)
        nu.set_left(left, left_left)
        nu.set_right(left, left_right)
        # ... then terminate every childless node with the sentinel.
        for leaf in (left_left, left_right, right):
            nu.set_left(leaf, nil)
            nu.set_right(leaf, nil)

        # Augmented end values, listed in the same order as ``nodes``.
        for node, aug_end in zip(nodes, (8, 5, 8, 2, 5)):
            nu.set_aug_end(node, aug_end)

        tu.set_root(tree, root)
        return tree
Beispiel #2
0
 def test_overlap_crosstest(self):
     """Cross-check the tree's overlap queries against a brute-force scan.

     Random intervals are inserted into a tree, then a fixed-width probe
     interval is moved across the coordinate range. For each probe, the
     number of results from ``query_all_overlaps`` must equal the number
     of intervals for which ``probe.overlaps`` is true, and
     ``query_overlap`` / ``has_overlap`` must agree with whether that
     number is zero.
     """
     count, min_start, max_start, max_len = 250, 1, 10000, 20
     window = 25
     intervals = self.create_intervals(count, min_start, max_start, max_len)
     first_window_start = min_start - window - 1

     tree = IVTree()
     for interval in intervals:
         tree.insert(interval)

     for window_start in range(first_window_start, max_start, window):
         probe = GenomicInterval(window_start, window_start + window)
         expected_hits = sum(
             1 for interval in intervals if probe.overlaps(interval)
         )
         found_hits = len(list(tree.query_all_overlaps(probe)))
         self.assertEqual(found_hits, expected_hits)

         first_hit = tree.query_overlap(probe)
         any_hit = tree.has_overlap(probe)
         if found_hits == 0:
             self.assertIsNone(first_hit)
             self.assertFalse(any_hit)
         else:
             self.assertIsNotNone(first_hit)
             self.assertTrue(any_hit)
Beispiel #3
0
 def read_handle(cls, handle):
     """Build a new container from the records parsed out of *handle*.

     Every record produced by ``PCTableParser(handle).parse()`` is wrapped
     in an ``Interval`` whose start and end are both ``rec.position``; the
     record is attached to that interval as ``_data`` and the interval is
     inserted into the tree stored under ``_sites[rec.seqid]``.

     Args:
         handle: Object handed to ``PCTableParser``.

     Returns:
         The newly created ``cls`` instance.
     """
     loaded = cls()
     for record in PCTableParser(handle).parse():
         position_iv = Interval(record.position, record.position)
         position_iv._data = record
         loaded._sites[record.seqid].insert(position_iv)
     return loaded
Beispiel #4
0
 def test_range(self):
     """Check ``get_range`` on an empty tree and on a populated one.

     The empty tree must report ``None``. After the four intervals are
     inserted the expected range is ``(2, 10)``: the smallest start and
     the largest end among them.
     """
     tree = IVTree()
     self.assertIsNone(tree.get_range())

     bounds = [(2, 5), (7, 8), (7, 10), (4, 8)]
     intervals = [GenomicInterval(start, end) for start, end in bounds]
     for interval in intervals:
         tree.insert(interval)

     self.assertEqual(tree.get_range(), (2, 10))
Beispiel #5
0
 def test_iterate(self):
     """Iterating a tree yields every inserted interval in the expected order.

     The intervals are inserted out of order; iteration must return the
     very same objects in the order ``iv1, iv2, iv3, iv4``.
     """
     iv1 = GenomicInterval(2, 5)
     iv2 = GenomicInterval(4, 6)
     iv3 = GenomicInterval(4, 10)
     iv4 = GenomicInterval(6, 8)
     expected = [iv1, iv2, iv3, iv4]
     tree = IVTree()
     for iv in iv4, iv2, iv1, iv3:
         tree.insert(iv)

     observed = list(tree)
     # Without this check a tree that yields too few intervals (or none at
     # all) would pass, because the element-wise loop would stop early.
     self.assertEqual(len(observed), len(expected))
     for obs_iv, exp_iv in zip(observed, expected):
         self.assertIs(obs_iv, exp_iv)
Beispiel #6
0
 def test_node_indel(self):
     """Insert into and remove from a single node, tracking its state.

     The node starts out holding one interval. A second interval with a
     larger end is added and removed again, then the first interval is
     removed; the node's max end and emptiness are checked at each step.
     """
     short_iv = GenomicInterval(1, 2)
     long_iv = GenomicInterval(1, 3)
     node = IVNode(short_iv._int_start, short_iv._int_end, True, short_iv)
     self.assertFalse(nu.is_empty(node))

     # After adding the longer interval the max end must be its end.
     nu.insert(node, long_iv)
     self.assertEqual(nu.max_end(node), long_iv._int_end)

     # Removing it again must restore the first interval's end.
     removed_long = nu.remove(node, long_iv)
     self.assertTrue(removed_long)
     self.assertEqual(nu.max_end(node), short_iv._int_end)
     self.assertFalse(nu.is_empty(node))

     # Removing the last interval leaves the node empty.
     removed_short = nu.remove(node, short_iv)
     self.assertTrue(removed_short)
     self.assertTrue(nu.is_empty(node))
Beispiel #7
0
 def __init__(self, records=None):
     """Index *records* both as a flat list and per sequence id.

     Each record is wrapped in an ``Interval`` whose start and end are
     both ``record.position``; the record itself is attached to that
     interval as ``data``. The interval is appended to ``_data_list`` and
     inserted into the ``IVTree`` stored under ``record.seqid`` in
     ``_data_tree``.

     Args:
         records: Iterable of records exposing ``position`` and ``seqid``.
             ``None`` (the default) is treated as an empty collection.
     """
     # A ``None`` sentinel replaces the former mutable ``[]`` default, which
     # would be one list object shared by every call.
     if records is None:
         records = ()
     data_list = []
     data_tree = defaultdict(IVTree)
     for record in records:
         iv = Interval(record.position, record.position)
         iv.data = record
         data_list.append(iv)
         data_tree[record.seqid].insert(iv)
     self._data_list = data_list
     self._data_tree = data_tree
     self._sort_keys = None
     self._descending = False
Beispiel #8
0
 def create_intervals(self, iv_count, iv_start_min, iv_start_max,
                      iv_len_max):
     """Generate *iv_count* random ``GenomicInterval`` objects.

     Each start is drawn with ``random.randrange(iv_start_min,
     iv_start_max)``; the matching end is that start plus
     ``random.randrange(iv_len_max)``.

     Returns:
         A list of the generated intervals, in creation order.
     """
     def draw():
         # Draw the start before the length so the sequence of random
         # calls per interval stays start-then-length.
         begin = random.randrange(iv_start_min, iv_start_max)
         length = random.randrange(iv_len_max)
         return GenomicInterval(begin, begin + length)

     return [draw() for _ in range(iv_count)]
Beispiel #9
0
 def test_overlap_all(self):
     """Compare ``query_all_overlaps`` with a per-interval overlap count.

     Random intervals are inserted into a tree while counting how many of
     them report an overlap with a fixed probe interval; the tree query
     for that probe must return the same number of results.
     """
     count, min_start, max_start, max_len = 200, 0, 500, 50
     probe = GenomicInterval(50, 75)
     tree = IVTree()
     expected = 0
     intervals = self.create_intervals(count, min_start, max_start, max_len)
     for interval in intervals:
         tree.insert(interval)
         expected += 1 if interval.overlaps(probe) else 0
     observed = sum(1 for _ in tree.query_all_overlaps(probe))
     self.assertEqual(expected, observed)
Beispiel #10
0
 def test_node_overlap(self):
     """Check how many stored intervals a node reports for various probes.

     The node holds two intervals, (1, 5) and (1, 10). Each probe is
     paired with the number of results ``node.overlap`` is expected to
     produce for it.
     """
     base_iv = GenomicInterval(1, 5)
     wide_iv = GenomicInterval(1, 10)
     node = IVNode(base_iv._int_start, base_iv._int_end, True, base_iv)
     nu.insert(node, wide_iv)

     # ((probe start, probe end), expected number of overlaps)
     expectations = (
         ((-5, 0), 0),
         ((-2, 2), 2),
         ((2, 4), 2),
         ((4, 6), 2),
         ((6, 8), 1),
         ((10, 12), 1),
         ((12, 14), 0),
     )
     probes = [
         (GenomicInterval(start, end), expected)
         for (start, end), expected in expectations
     ]
     for probe, expected in probes:
         hits = list(node.overlap(probe._int_start, probe._int_end))
         self.assertEqual(len(hits), expected)
Beispiel #11
0
def main():
    """Annotate sites from a parclip table with overlapping GFF3 feature types.

    Steps:

    1. Parse the command line via ``create_parser()``.
    2. Load every record of every file in ``args.gff3_annot`` into one
       ``IVTree`` per sequence id.
    3. Read ``args.parclip_table`` and rank its records by descending
       occupancy (rank 0 is the highest occupancy).
    4. For every record whose rank is below ``args.max_n``, collect the
       types of all same-strand GFF3 records overlapping its position and
       write the record plus an ``annotation`` column to
       ``<output_dir>/<table name>_annot``. Several types are joined with
       ``|`` in sorted order; records without a hit get ``NA``.
    5. Write per-type counts of unambiguous hits, plus the number of
       unannotated and ambiguous records, to ``<output_dir>/summary.tab``.
    """
    args = create_parser().parse_args()

    # One interval tree per sequence id, holding all GFF3 records.
    gff_tree = defaultdict(IVTree)
    for gff_file in args.gff3_annot:
        with open(gff_file) as gff:
            # Named separately so it does not shadow the argument parser.
            gff_parser = GFF3Parser(gff)
            for record in gff_parser.parse():
                iv = Interval(record.start, record.end)
                iv.rec = record
                gff_tree[record.seqid].insert(iv)

    table = []
    occ_vector = []
    with open(args.parclip_table) as in_table:
        pc_parser = PCTableParser(in_table)
        for rec in pc_parser.parse():
            table.append(rec)
            # Negated so that an ascending argsort ranks high occupancy first.
            occ_vector.append(-rec.occupancy)
    sort_vec = np.argsort(occ_vector)
    # Invert the sort permutation: ranks[i] is the rank of table[i].
    ranks = np.empty(len(occ_vector), int)
    ranks[sort_vec] = np.arange(len(occ_vector))

    table_name = os.path.basename(args.parclip_table)
    annot_table = os.path.join(args.output_dir, table_name + '_annot')
    hit_counter = Counter()
    ambig_hits = 0
    unannotated = 0
    with open(annot_table, 'w') as out_table:
        print(*pc_parser._fields, 'annotation', sep='\t', file=out_table)
        for ind, rec in zip(ranks, table):
            if ind >= args.max_n:
                continue
            iv = Interval(rec.position, rec.position)
            hits = set()
            for gff_rec in gff_tree[rec.seqid].query_all_overlaps(iv):
                if rec.strand == gff_rec.rec.strand:
                    hits.add(gff_rec.rec.type)

            if len(hits) > 1:
                ambig_hits += 1
            elif not hits:
                unannotated += 1
            else:
                hit, = hits
                hit_counter[hit] += 1

            # Sort the types: iterating a set of strings gives an order that
            # can change between runs, which made the output unstable.
            annotation = '|'.join(sorted(hits)) if hits else 'NA'
            print(*rec, annotation, sep='\t', file=out_table)

    summary_file = os.path.join(args.output_dir, 'summary.tab')
    with open(summary_file, 'w') as sum_file:
        for annot, count in hit_counter.items():
            print(annot, count, sep='\t', file=sum_file)
        print('unannotated', unannotated, sep='\t', file=sum_file)
        print('ambiguous', ambig_hits, sep='\t', file=sum_file)
0
 def test_iv_one_based(self):
     """Check which coordinates are accepted with ``one_based=True``.

     (1, 4) and (1, 1) must construct without error, whereas (2, 1) must
     raise ``ValueError``.
     """
     for start, end in ((1, 4), (1, 1)):
         GenomicInterval(start, end, one_based=True)
     self.assertRaises(ValueError, GenomicInterval, 2, 1, one_based=True)
Beispiel #13
0
 def test_iv_overlap(self):
     """Check ``overlaps`` for a fixed set of interval pairs.

     Each entry of the table names the interval the method is called on,
     the interval passed to it, and whether the result must be truthy.
     """
     iv1 = GenomicInterval(1, 3)
     iv2 = GenomicInterval(4, 5)
     iv3 = GenomicInterval(5, 6)
     iv4 = GenomicInterval(1, 2)
     iv5 = GenomicInterval(1, 1)
     iv6 = GenomicInterval(3, 3)

     checks = (
         (iv1, iv2, False),
         (iv2, iv3, True),
         (iv4, iv2, False),
         # NOTE(review): this repeats the iv2/iv3 check from two rows up;
         # possibly a different pair was intended -- confirm with the author.
         (iv2, iv3, True),
         (iv5, iv1, True),
         (iv6, iv1, True),
     )
     for caller, other, should_overlap in checks:
         outcome = caller.overlaps(other)
         if should_overlap:
             self.assertTrue(outcome)
         else:
             self.assertFalse(outcome)
Beispiel #14
0
 def test_iv_zero_based(self):
     """Check which coordinates are accepted with ``one_based=False``.

     (1, 4) and (1, 2) must construct without error, whereas (1, 1) must
     raise ``ValueError``.
     """
     for start, end in ((1, 4), (1, 2)):
         GenomicInterval(start, end, one_based=False)
     self.assertRaises(ValueError, GenomicInterval, 1, 1, one_based=False)