def setUp(self):
    """Build the fixture tree from width-9 intervals starting at 0, 10, ..., 100."""
    root = IntervalNode(Interval(50, 59))
    for start in range(0, 110, 10):
        # (50, 59) already seeds the root, so it is not inserted a second time.
        if start != 50:
            root = root.insert(Interval(start, start + 9))
    self.intervals = root
def test_max_dist(self):
    """Check how many hits right() returns as max_dist grows in steps of 10."""
    tree = self.intervals

    hits = tree.right(Interval(1, 1), max_dist=0, n=10)
    self.assertEqual(len(hits), 0)

    # Each additional step of 10 in max_dist is expected to admit one more hit.
    for expected, dist in enumerate(range(10, 1000, 10), start=1):
        hits = tree.right(Interval(1, 1), max_dist=dist, n=10000)
        self.assertEqual(len(hits), expected)
def test_count(self):
    """Check the number of neighbours returned by right() and left() for n=33."""
    tree = self.intervals

    to_the_right = tree.right(Interval(1, 1), n=33)
    self.assertEqual(len(to_the_right), 33)

    to_the_left = tree.left(Interval(1, 1), n=33)
    self.assertEqual(len(to_the_left), 1)
def setUp(self):
    """Build a large fixture tree: point intervals every 10 plus 6000 copies of (0, 1)."""
    root = IntervalNode(Interval(1, 2))
    self.max = 1000000

    # Zero-width intervals at 0, 10, 20, ... below self.max.
    for position in range(0, self.max, 10):
        root = root.insert(Interval(position, position))

    # Many identical intervals at the left edge.
    for _ in range(6000):
        root = root.insert(Interval(0, 1))

    self.intervals = root
def test_left(self):
    """Check left() against the fixture intervals."""
    tree = self.intervals

    # The two nearest intervals left of (60, 70), compared via their str() form.
    found = str(tree.left(Interval(60, 70), n=2))
    expected = str([Interval(50, 59), Interval(40, 49)])
    self.assertEqual(found, expected)

    # For a point query at each multiple of 10, the single nearest left
    # neighbour is expected to end one position before the query.
    for point in range(10, 100, 10):
        nearest = tree.left(Interval(point, point), max_dist=10, n=1)
        self.assertEqual(nearest[0].end, point - 1)
def test_n(self):
    """Check that right() with n=2 returns the next two intervals in order."""
    tree = self.intervals
    for base in range(0, 90, 10):
        probe = Interval(base + 1, base + 1)
        hits = tree.right(probe, max_dist=20, n=2)
        # First hit should start at base + 10, second at base + 20.
        for rank, offset in enumerate((10, 20)):
            self.assertEqual(hits[rank].start, base + offset)
def test_right(self):
    """Check right() against the fixture intervals."""
    tree = self.intervals

    # NOTE(review): this opening check calls left(), not right() -- confirm
    # that is intended.
    found = str(tree.left(Interval(60, 70), n=2))
    expected = str([Interval(50, 59), Interval(40, 49)])
    self.assertEqual(found, expected)

    def get_right_start(b10):
        """Return the start of the single nearest interval right of (b10, b10 + 1)."""
        hits = tree.right(Interval(b10, b10 + 1), n=1)
        assert len(hits) == 1
        return hits[0].start

    for base in range(10, 100, 10):
        self.assertEqual(get_right_start(base), base + 10)

    # A point query one position before each multiple of 10 should find the
    # interval that starts at that multiple.
    for base in range(0, 100, 10):
        probe = Interval(base - 1, base - 1)
        hits = tree.right(probe, max_dist=10, n=1)
        self.assertEqual(hits[0].start, base)
def test_tree_pickle(self):
    """Round-trip an IntervalTree through dump()/load() and compare lookups.

    The pickle is written inside a temporary directory that is removed
    afterwards, so running the test does not leave an ``a.pkl`` file in the
    current working directory.
    """
    import os
    import shutil
    import tempfile

    a = IntervalTree()
    # The same set of intervals is inserted five times over.
    for _ in range(5):
        for i in range(10, 100, 6):
            a.insert(Interval(i - 4, i + 4))

    tmp_dir = tempfile.mkdtemp()
    try:
        pkl_path = os.path.join(tmp_dir, 'a.pkl')
        a.dump(pkl_path)
        b = IntervalTree()
        b.load(pkl_path)
    finally:
        # Best-effort cleanup; never mask a failure from dump()/load().
        shutil.rmtree(tmp_dir, ignore_errors=True)

    by_start = operator.attrgetter('start')
    for _ in range(5):
        for i in range(10, 100, 6):
            f = Interval(i - 4, i + 4)
            af = sorted(a.find(f), key=by_start)
            bf = sorted(b.find(f), key=by_start)

            # The reloaded tree must find something, and agree with the
            # original tree on the hit count and on the extreme starts.
            assert len(bf) > 0
            self.assertEqual(len(af), len(bf))
            self.assertEqual(af[0].start, bf[0].start)
            self.assertEqual(af[-1].start, bf[-1].start)
def annotation(self, annotation_type, start=None, end=None) -> typing.List[Annotation]:
    """Return the sorted annotation payloads stored in ``self._annotations``.

    Parameters
    ----------
    annotation_type
        When truthy, only annotations whose ``annotation_type`` matches it
        case-insensitively are returned. When falsy, no type filtering is done.
    start, end
        When both are given, the lookup is restricted to
        ``Interval(start, end)`` and to entries whose data overlaps
        ``Span(start, end)``. If either is ``None`` the lookup covers
        ``Interval(0, self.end)``.

    Returns
    -------
    list
        The ``.data`` of each matching entry, sorted, excluding any entry
        whose data compares equal to ``self``. An empty list is returned if
        the interval lookup raises.
    """
    try:
        if end is None or start is None:
            anno_iter = self._annotations.find(Interval(0, self.end))
        else:
            # filter() is lazy on Python 3: the overlap predicate runs later,
            # while the results are consumed, outside this try block.
            anno_iter = filter(
                lambda x: x.data.overlaps(Span(start, end)),
                self._annotations.find(Interval(start, end)))
    except Exception:
        # A failed lookup is treated as "no annotations". Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate.
        return []

    if annotation_type:
        annotation_type = annotation_type.lower()
        return sorted([
            x.data for x in anno_iter
            if x.data.annotation_type.lower() == annotation_type
            and x.data != self
        ])
    return sorted([x.data for x in anno_iter if x.data != self])
def parse_ribotricer_index(ribotricer_index):
    """Collect the leading 'annotated' ORFs from a ribotricer index file.

    The file is read twice: once to count the leading run of lines that
    contain the text "annotated" (used as the progress-bar total), and once
    to parse them. Reading stops at the first line after the header that
    does not contain "annotated".

    Parameters
    ----------
    ribotricer_index: str
                      Path to the index file generated by
                      ribotricer prepare_orfs

    Returns
    -------
    annotated: List[ORF]
               Parsed ORFs whose category is "annotated"
    refseq: defaultdict(IntervalTree)
            chrom: (start, end, strand)
    """
    annotated = []
    refseq = defaultdict(IntervalTree)

    # Pass 1: count the leading annotated lines so the progress bar has a total.
    total_lines = 0
    with open(ribotricer_index, "r") as anno:
        anno.readline()  # skip header
        for line in iter(anno.readline, ""):
            if "annotated" not in line:
                break
            total_lines += 1

    # Pass 2: parse those same lines.
    with open(ribotricer_index, "r") as anno, tqdm(
            total=total_lines, unit="lines", leave=False) as pbar:
        anno.readline()  # skip header
        for line in iter(anno.readline, ""):
            if "annotated" not in line:
                break
            pbar.update()
            orf = ORF.from_string(line)
            if orf is None or orf.category != "annotated":
                continue
            # One interval per ORF, from its first start to its last end.
            span = Interval(
                orf.intervals[0].start,
                orf.intervals[-1].end,
                STRAND_TO_NUM[orf.strand],
            )
            refseq[orf.chrom].insert(span)
            annotated.append(orf)

    return (annotated, refseq)
def setUp(self):
    """Create randomly sized intervals around regular centres and index them.

    For every centre in ``range(11, 20000, 15)``, between 2 and 5 intervals
    are generated, each extending 1-10 positions to either side.
    """
    generated = []
    for centre in range(11, 20000, 15):
        copies = random.randint(2, 5)
        for _ in range(copies):
            left_pad = random.randint(1, 10)
            right_pad = random.randint(1, 10)
            generated.append(Interval(centre - left_pad, centre + right_pad))

    # Seed the tree with the first interval, then insert the rest.
    root = IntervalNode(generated[0])
    for item in generated[1:]:
        root = root.insert(item)

    self.intervals = generated
    self.tree = root
def create_annotation(self, type: str, start: int, end: int, attributes=None) -> Annotation:
    """Create an Annotation on this object, register it, and return it.

    The new annotation receives the current ``self._next_id``, which is then
    incremented. It is inserted into ``self._annotations`` under its own
    start/end and recorded in ``self._aid_dict`` by its ``annotation_id``.
    ``attributes`` defaults to a fresh empty list.
    """
    attrs = [] if attributes is None else attributes

    created = Annotation(self, start, end, type, attrs, self._next_id)
    self._next_id += 1

    entry = Interval(created.start, created.end, created)
    self._annotations.insert(entry)
    self._aid_dict[created.annotation_id] = created
    return created
def setUp(self):
    """Set up paths to the test data files, a small matrix and interval pairs."""
    here = os.path.dirname(__file__)

    # Data files are located next to this test module.
    data_files = (
        ('fa', 'test.fa'),
        ('fai', 'test.fa.fai'),
        ('gff3_1', 'test_1.gff3'),
        ('gff3_2', 'test_2.gff3'),
    )
    for attr_name, file_name in data_files:
        setattr(self, attr_name, os.path.join(here, file_name))

    self.A = np.array([
        [1, 1, 0, 0],
        [1, 1, 1, 1],
        [0, 1, 1, 0],
    ])

    # Interval pairs used by the tests.
    self.i5 = (Interval(0, 10), Interval(5, 15))
    self.i0 = (Interval(0, 10), Interval(10, 15))
    self.i3 = (Interval(0, 9), Interval(0, 3))
def test_left(self):
    """Compare tree.left() with brute_force_find_left() on random point queries."""
    limit_dist = 200
    limit_n = 15
    tree = self.tree

    for anchor in range(11, 20000, 25):
        for _ in range(random.randint(2, 5)):
            pos = random.randint(anchor + 1, anchor + 20)
            query = Interval(pos, pos)

            expected = brute_force_find_left(
                self.intervals, query, limit_dist, limit_n)
            found = tree.left(query, max_dist=limit_dist, n=limit_n)

            if len(found) == 0:
                assert len(expected) == 0, expected
            else:
                # The largest distance is computed but not asserted on.
                mdist = max(distance(query, hit) for hit in found)

                # Everything the tree found must be in the brute-force
                # result, and the brute-force result must have no extras.
                expected_set = set(expected)
                self.assertTrue(expected_set.issuperset(found))
                missing = expected_set.difference(found)
                self.assertTrue(not missing, (missing))
def infer_protocol(bam, gene_interval_tree, prefix, n_reads=20000):
    """Infer strandedness protocol given a bam file

    Parameters
    ----------
    bam: str
         Path to bam file
    gene_interval_tree: defaultdict(IntervalTree)
                        chrom: (start, end, strand)
    prefix: str
            Prefix for protocol file
    n_reads: int
             Number of reads to use (downsampled)

    Returns
    -------
    protocol: string
              "forward" or "reverse"

    The strategy to do this is simple: keep a track of mapped reads and their
    strand and then tally if the location of their mapping has a gene defined
    on the positive strand or the negative strand. If the first and second
    characters denote the mapping and gene strand respectively:

        Higher proportion of (++, --) implies forward protocol
        Higher proportion of (+-, -+) implies reverse protocol

    "reverse" is returned only when the (+-, -+) count is strictly larger;
    a tie yields "forward". A summary of the tallies is written to
    ``<prefix>_protocol.txt``.
    """
    iteration = 0
    strandedness = Counter()

    # The context manager closes the BAM handle even if an error occurs.
    with pysam.AlignmentFile(bam, "rb") as bam_file:
        for read in bam_file.fetch(until_eof=True):
            if iteration > n_reads:
                # Read budget exhausted: stop instead of scanning the rest of
                # the file. The threshold matches the original
                # `iteration <= n_reads` check, so the tallies are unchanged.
                break
            if not is_read_uniq_mapping(read):
                continue

            mapped_strand = "-" if read.is_reverse else "+"
            mapped_start = read.reference_start
            mapped_end = read.reference_end
            chrom = read.reference_name

            # get corresponding gene's strand
            interval = list(
                set(gene_interval_tree[chrom].find(
                    Interval(mapped_start, mapped_end))))
            if len(interval) == 1:
                # Filter out genes with ambiguous strand info
                # (those) that have a tx_start on opposite strands
                gene_strand = NUM_TO_STRAND[interval[0].data]
                # count table for mapped strand vs gene strand
                strandedness["{}{}".format(mapped_strand, gene_strand)] += 1
                # NOTE(review): only reads that hit exactly one gene are
                # assumed to count towards n_reads -- confirm the nesting of
                # this increment against the original file.
                iteration += 1

    # Add pseudocounts
    strandedness["++"] += 1
    strandedness["--"] += 1
    strandedness["+-"] += 1
    strandedness["-+"] += 1

    # The pseudocounts guarantee total >= 4, so the divisions below are safe.
    total = sum(strandedness.values())
    forward_mapped_reads = strandedness["++"] + strandedness["--"]
    reverse_mapped_reads = strandedness["-+"] + strandedness["+-"]
    to_write = (
        "In total {} reads checked:\n"
        '\tNumber of reads explained by "++, --": {} ({:.4f})\n'
        '\tNumber of reads explained by "+-, -+": {} ({:.4f})\n'
    ).format(
        total,
        forward_mapped_reads,
        forward_mapped_reads / total,
        reverse_mapped_reads,
        reverse_mapped_reads / total,
    )
    with open("{}_protocol.txt".format(prefix), "w") as output:
        output.write(to_write)

    protocol = "forward"
    if reverse_mapped_reads > forward_mapped_reads:
        protocol = "reverse"
    return protocol
def test_find(self):
    """Call find() on the fixture tree; the result is not checked."""
    query = Interval(46, 47)
    self.tree.find(query)
def get_right_start(b10):
    """Return the start of the single nearest interval right of (b10, b10 + 1).

    Relies on ``iv`` being available from the enclosing scope.
    """
    query = Interval(b10, b10 + 1)
    hits = iv.right(query, n=1)
    assert len(hits) == 1
    return hits[0].start
def test_feature_pickle(self):
    """Check that an Interval keeps its start and data through dumps()/loads()."""
    original = Interval(22, 38, data={'a': 22})
    restored = loads(dumps(original))
    self.assertEqual(original.start, restored.start)
    self.assertEqual(restored.data['a'], 22)
def test_left(self):
    """Call left() on the fixture tree; the result is not checked."""
    query = Interval(46, 47)
    self.tree.left(query)
def test_toomany(self):
    """Asking left() for more neighbours than exist returns only those available."""
    tree = self.intervals
    neighbours = tree.left(Interval(60, 70), n=200)
    self.assertEqual(len(neighbours), 6)
def test_right(self):
    """Call right() on the fixture tree; the result is not checked."""
    query = Interval(46, 47)
    self.tree.right(query)
def test_left(self):
    """Check the number of left() hits on tree4 for two query intervals."""
    cases = (
        (2, (44, 55)),
        (0, (11, 12)),
    )
    for expected_count, (lo, hi) in cases:
        hits = self.tree4.left(Interval(lo, hi))
        self.assertEqual(expected_count, len(hits))
def setUp(self):
    """Create tree4 holding two intervals with identical coordinates but different data."""
    self.tree4 = IntervalTree()
    for label in ('example1', 'example2'):
        self.tree4.insert(Interval(22, 33, data=label))