def create_custom_tree(self):
    """Hand-assemble a five-node interval tree without using ``insert``.

    Layout built below (children not shown point at the tree sentinel)::

                (5, 6)
               /      \
           (3, 4)    (7, 8)
           /    \
       (1, 2)  (4, 5)

    Returns:
        The ``IVTree`` whose root is the ``(5, 6)`` node.
    """
    ivtree = IVTree()
    sentinel = tu.sentinel(ivtree)

    bounds = ((5, 6), (3, 4), (7, 8), (1, 2), (4, 5))
    # Third positional argument handed to IVNode for each node, in the same
    # order as ``bounds`` (presumably the node colour -- TODO confirm).
    flags = (True, False, True, True, True)

    intervals = [GenomicInterval(start, end) for start, end in bounds]
    # NOTE(review): other tests in this file build nodes from
    # ``_int_start``/``_int_end``; confirm ``start``/``end`` is intended here.
    root, left, right, left_left, left_right = [
        IVNode(iv.start, iv.end, flag, iv)
        for iv, flag in zip(intervals, flags)
    ]

    # Wire up the internal nodes.
    nu.set_left(root, left)
    nu.set_right(root, right)
    nu.set_left(left, left_left)
    nu.set_right(left, left_right)

    # Leaves have the sentinel on both sides.
    for leaf in (left_left, left_right, right):
        nu.set_left(leaf, sentinel)
        nu.set_right(leaf, sentinel)

    # Augmented end value stored on each node.
    aug_ends = (
        (root, 8),
        (left, 5),
        (right, 8),
        (left_left, 2),
        (left_right, 5),
    )
    for node, aug_end in aug_ends:
        nu.set_aug_end(node, aug_end)

    tu.set_root(ivtree, root)
    return ivtree
def test_overlap_crosstest(self):
    """Cross-check the tree's overlap queries against a brute-force scan.

    Random intervals are inserted into a tree; a fixed-width window is then
    moved across the coordinate range and, for every window position,
    ``query_all_overlaps``, ``query_overlap`` and ``has_overlap`` are
    compared with a linear scan over all intervals.
    """
    count, min_start, max_start, max_len = 250, 1, 10000, 20
    step_size = 25
    ivs = self.create_intervals(count, min_start, max_start, max_len)

    tree = IVTree()
    for iv in ivs:
        tree.insert(iv)

    # Start one window (plus one position) before the smallest possible start.
    first_window_start = min_start - step_size - 1
    for window_start in range(first_window_start, max_start, step_size):
        window = GenomicInterval(window_start, window_start + step_size)

        expected_hits = sum(1 for iv in ivs if window.overlaps(iv))

        tree_hits = len(list(tree.query_all_overlaps(window)))
        self.assertEqual(tree_hits, expected_hits)

        any_hit = tree.query_overlap(window)
        has_hit = tree.has_overlap(window)
        if tree_hits > 0:
            self.assertIsNotNone(any_hit)
            self.assertTrue(has_hit)
        else:
            self.assertIsNone(any_hit)
            self.assertFalse(has_hit)
def read_handle(cls, handle):
    """Build a new ``cls`` instance from the records parsed out of *handle*.

    Each record yielded by ``PCTableParser(handle).parse()`` becomes a
    single-position ``Interval`` (start and end both ``rec.position``) that
    carries the record in ``_data`` and is inserted into the instance's
    ``_sites`` entry for ``rec.seqid``.

    Args:
        handle: Object passed straight to ``PCTableParser``.

    Returns:
        The populated ``cls`` instance.
    """
    instance = cls()
    for record in PCTableParser(handle).parse():
        point = Interval(record.position, record.position)
        point._data = record
        instance._sites[record.seqid].insert(point)
    return instance
def test_range(self):
    """``get_range`` is ``None`` when empty, else spans all inserted intervals."""
    tree = IVTree()
    self.assertIsNone(tree.get_range())

    bounds = ((2, 5), (7, 8), (7, 10), (4, 8))
    intervals = [GenomicInterval(start, end) for start, end in bounds]
    for interval in intervals:
        tree.insert(interval)

    # Smallest start is 2, largest end is 10.
    self.assertEqual(tree.get_range(), (2, 10))
def test_iterate(self):
    """Iterating a tree yields exactly the inserted intervals, in order.

    The intervals are inserted out of order and the iteration result is
    compared, by identity, with the expected sequence ``ivs``.
    """
    iv1 = GenomicInterval(2, 5)
    iv2 = GenomicInterval(4, 6)
    iv3 = GenomicInterval(4, 10)
    iv4 = GenomicInterval(6, 8)
    ivs = [iv1, iv2, iv3, iv4]

    tree = IVTree()
    for iv in iv4, iv2, iv1, iv3:
        tree.insert(iv)

    observed = list(tree)
    # Check the count first: comparing element-wise alone would pass
    # vacuously if iteration stopped early or yielded nothing at all.
    self.assertEqual(len(observed), len(ivs))
    for obs_iv, exp_iv in zip(observed, ivs):
        self.assertIs(obs_iv, exp_iv)
def test_node_indel(self):
    """Insert into and remove from a single node, tracking its max end.

    The statements are order-dependent: each assertion checks the node
    state left behind by the preceding insert/remove.
    """
    short_iv = GenomicInterval(1, 2)
    long_iv = GenomicInterval(1, 3)

    node = IVNode(short_iv._int_start, short_iv._int_end, True, short_iv)
    self.assertFalse(nu.is_empty(node))

    # Adding the longer interval raises the node's max end.
    nu.insert(node, long_iv)
    self.assertEqual(nu.max_end(node), long_iv._int_end)

    # Removing it again drops the max end back to the shorter interval's.
    removed_long = nu.remove(node, long_iv)
    self.assertTrue(removed_long)
    self.assertEqual(nu.max_end(node), short_iv._int_end)
    self.assertFalse(nu.is_empty(node))

    # Removing the last interval leaves the node empty.
    removed_short = nu.remove(node, short_iv)
    self.assertTrue(removed_short)
    self.assertTrue(nu.is_empty(node))
def __init__(self, records=None):
    """Index *records* both as a flat list and as per-sequence interval trees.

    Every record is wrapped in a single-position ``Interval`` (start and end
    both ``record.position``) with the record attached as ``data``. The
    interval is appended to ``_data_list`` and inserted into the ``IVTree``
    stored under ``record.seqid`` in ``_data_tree``.

    Args:
        records: Iterable of records exposing ``position`` and ``seqid``.
            ``None`` (the default) is treated as "no records".
    """
    # A ``None`` sentinel replaces the former mutable ``[]`` default, which
    # is a single list object shared by every call.
    if records is None:
        records = ()

    data_list = []
    data_tree = defaultdict(IVTree)
    for record in records:
        iv = Interval(record.position, record.position)
        iv.data = record
        data_list.append(iv)
        data_tree[record.seqid].insert(iv)

    self._data_list = data_list
    self._data_tree = data_tree
    self._sort_keys = None
    self._descending = False
def create_intervals(self, iv_count, iv_start_min, iv_start_max, iv_len_max):
    """Return ``iv_count`` randomly placed ``GenomicInterval`` objects.

    Args:
        iv_count: Number of intervals to generate.
        iv_start_min: Lower bound (inclusive) for the random start.
        iv_start_max: Upper bound (exclusive) for the random start.
        iv_len_max: Exclusive upper bound for the random offset added to
            the start to obtain the end.

    Returns:
        List of intervals in generation order.
    """
    intervals = []
    for _ in range(iv_count):
        # Draw the start before the offset so the random sequence is
        # consumed in a fixed order.
        start = random.randrange(iv_start_min, iv_start_max)
        offset = random.randrange(iv_len_max)
        intervals.append(GenomicInterval(start, start + offset))
    return intervals
def test_overlap_all(self):
    """``query_all_overlaps`` finds as many hits as a direct pairwise check."""
    count, min_start, max_start, max_len = 200, 0, 500, 50
    query = GenomicInterval(50, 75)
    tree = IVTree()

    intervals = self.create_intervals(count, min_start, max_start, max_len)
    expected_hits = 0
    for interval in intervals:
        tree.insert(interval)
        if interval.overlaps(query):
            expected_hits += 1

    observed_hits = sum(1 for _ in tree.query_all_overlaps(query))
    self.assertEqual(expected_hits, observed_hits)
def test_node_overlap(self):
    """A node holding (1, 5) and (1, 10) reports the right overlap counts."""
    short_iv = GenomicInterval(1, 5)
    long_iv = GenomicInterval(1, 10)
    node = IVNode(short_iv._int_start, short_iv._int_end, True, short_iv)
    nu.insert(node, long_iv)

    # (query start, query end, expected number of overlapping intervals)
    cases = (
        (-5, 0, 0),
        (-2, 2, 2),
        (2, 4, 2),
        (4, 6, 2),
        (6, 8, 1),
        (10, 12, 1),
        (12, 14, 0),
    )
    queries = [
        (GenomicInterval(start, end), expected)
        for start, end, expected in cases
    ]

    for query, expected in queries:
        hits = list(node.overlap(query._int_start, query._int_end))
        self.assertEqual(len(hits), expected)
def main():
    """Annotate the sites of a parclip table with overlapping GFF3 feature types.

    Steps:

    1. Parse the command line with the parser from ``create_parser()``.
    2. Load every GFF3 file in ``args.gff3_annot`` into one ``IVTree`` per
       ``seqid``.
    3. Read the table at ``args.parclip_table`` and rank its records by
       descending ``occupancy`` (rank 0 is the highest occupancy).
    4. Write ``<output_dir>/<table name>_annot``: a header line followed by
       every record with rank below ``args.max_n``, plus an ``annotation``
       column listing the types of same-strand GFF3 features overlapping
       the record's position (``NA`` when there are none, ``|``-separated
       when there are several).
    5. Write ``<output_dir>/summary.tab`` with per-type counts of uniquely
       annotated records and the numbers of unannotated and ambiguous ones.
    """
    arg_parser = create_parser()
    args = arg_parser.parse_args()

    # One interval tree per sequence id, filled from all annotation files.
    gff_tree = defaultdict(IVTree)
    for gff_file in args.gff3_annot:
        with open(gff_file) as gff:
            gff_parser = GFF3Parser(gff)
            for record in gff_parser.parse():
                iv = Interval(record.start, record.end)
                iv.rec = record
                gff_tree[record.seqid].insert(iv)

    table = []
    occ_vector = []
    with open(args.parclip_table) as in_table:
        pc_parser = PCTableParser(in_table)
        for rec in pc_parser.parse():
            table.append(rec)
            # Negated so that an ascending argsort ranks high occupancy first.
            occ_vector.append(-rec.occupancy)

    # ranks[i] is the position of table[i] in descending-occupancy order.
    sort_vec = np.argsort(occ_vector)
    ranks = np.empty(len(occ_vector), int)
    ranks[sort_vec] = np.arange(len(occ_vector))

    hit_counter = Counter()
    ambig_hits = 0
    unannotated = 0

    table_name = os.path.basename(args.parclip_table)
    annot_table = os.path.join(args.output_dir, table_name + '_annot')
    with open(annot_table, 'w') as out_table:
        print(*chain(pc_parser._fields, ['annotation']),
              sep='\t', file=out_table)
        for ind, rec in zip(ranks, table):
            # Only the top ``max_n`` records by occupancy are annotated.
            if ind >= args.max_n:
                continue

            iv = Interval(rec.position, rec.position)
            hits = set()
            for gff_rec in gff_tree[rec.seqid].query_all_overlaps(iv):
                if rec.strand == gff_rec.rec.strand:
                    hits.add(gff_rec.rec.type)

            if not hits:
                unannotated += 1
                annotation = 'NA'
            else:
                if len(hits) > 1:
                    ambig_hits += 1
                else:
                    (hit,) = hits
                    hit_counter[hit] += 1
                # Sort so the column is reproducible; iterating a set of
                # strings directly gives an arbitrary, run-dependent order.
                annotation = '|'.join(sorted(hits))

            print(*chain(list(rec), [annotation]), sep='\t', file=out_table)

    summary_file = os.path.join(args.output_dir, 'summary.tab')
    with open(summary_file, 'w') as sum_file:
        for annot, count in hit_counter.items():
            print(annot, count, sep='\t', file=sum_file)
        print('unannotated', unannotated, sep='\t', file=sum_file)
        print('ambiguous', ambig_hits, sep='\t', file=sum_file)
def test_iv_one_based(self):
    """One-based intervals accept start <= end and reject start > end."""
    # Both must construct without raising; (1, 1) has equal start and end.
    for start, end in ((1, 4), (1, 1)):
        GenomicInterval(start, end, one_based=True)

    self.assertRaises(ValueError, GenomicInterval, 2, 1, one_based=True)
def test_iv_overlap(self):
    """Check ``overlaps`` on disjoint, end-sharing and contained intervals."""
    wide = GenomicInterval(1, 3)
    mid = GenomicInterval(4, 5)
    high = GenomicInterval(5, 6)
    low = GenomicInterval(1, 2)
    point_at_1 = GenomicInterval(1, 1)
    point_at_3 = GenomicInterval(3, 3)

    # (left operand, right operand, whether an overlap is expected)
    cases = (
        (wide, mid, False),
        (mid, high, True),
        (low, mid, False),
        (mid, high, True),
        (point_at_1, wide, True),
        (point_at_3, wide, True),
    )
    for left, right, expected in cases:
        check = self.assertTrue if expected else self.assertFalse
        check(left.overlaps(right))
def test_iv_zero_based(self):
    """Zero-based intervals accept start < end and reject start == end."""
    # Both must construct without raising.
    for start, end in ((1, 4), (1, 2)):
        GenomicInterval(start, end, one_based=False)

    self.assertRaises(ValueError, GenomicInterval, 1, 1, one_based=False)