def make_exons_from_base_mapping(mapping, start, end, strand):
    """
    Collapse a per-base transcript --> genome mapping into exon Intervals.

    mapping --- 0-based index on transcript --> (0-based index on genome, is_junction);
                however beware of strand!
    start --- first transcript index to cover (looked up in mapping)
    end --- last transcript index to cover (also looked up in mapping)
    strand --- '+' for forward; any other value is handled as the - strand

    Returns a list of Interval (0-based start, 1-based end), built from
    consecutive pairs of the collected boundary positions.
    """
    output = [mapping[start]]
    for i in range(start + 1, end):
        entry = mapping[i]
        _, is_junction = entry
        # if the last position is the same, DON'T APPEND (was an indel)
        if is_junction and entry != output[-1]:
            output.append(entry)
    if mapping[end] != output[-1]:
        output.append(mapping[end])

    # remember for Interval it is 0-based start, 1-based end
    # if len(output) is odd, must be 1bp into an exon
    # ex: [(xxx,True), (xxx,True), (xxx,False)] or
    #     [.....(xxx,True), xxx(True)]
    if len(output) == 1:
        output = [output[0], output[0]]  # just duplicate it
    elif len(output) % 2 == 1:
        if output[0][1] and output[1][1]:
            output.insert(0, output[0])
        elif output[-1][1] and output[-2][1]:
            output.append(output[-1])

    if strand == '+':
        return [Interval(output[i][0], output[i + 1][0] + 1)
                for i in range(0, len(output), 2)]
    # - strand: walk the boundary pairs from the back
    return [Interval(output[i][0], output[i - 1][0] + 1)
            for i in range(len(output) - 1, -1, -2)]
def get_exon_coordinates(exons, start, end):
    """
    Return the "exons" (genome locations) covered by transcript positions start-end.

    exons --- indexable, sliceable collection of Interval (0-based start, 1-based end)
    start --- 0-based offset along the concatenated exons
    end --- 1-based offset along the concatenated exons; may overshoot the
            transcript length by up to 30 bp and is then trimmed

    Returns a list of Interval: a clipped first exon, any whole exons in
    between, and (unless the range runs to the end) a clipped last exon.
    """
    # running total of exon lengths, ex: [0, 945, 1065, 1141, 1237]
    cumulative = [0]
    for exon in exons:
        cumulative.append(cumulative[-1] + (exon.end - exon.start))
    transcript_len = cumulative[-1]

    # confirm that start-end is in the range of the transcript!
    # (a 30-bp slack is allowed due to PacBio indels)
    assert 0 <= start < end <= transcript_len + 30
    end = min(end, transcript_len)  # trim it to the end if necessary (for PacBio)

    # 1-based ordinals of the exons holding start and end
    first = bisect.bisect_right(cumulative, start)
    last = bisect.bisect_right(cumulative, end)

    first_exon = exons[first - 1]
    first_offset = cumulative[first - 1]
    genomic_start = start - first_offset + first_exon.start

    if first == last:
        # whole range sits inside a single exon
        return [Interval(genomic_start, end - first_offset + first_exon.start)]

    head = Interval(genomic_start, first_exon.end)
    if last >= len(exons):
        # the end is the end: keep every remaining exon whole
        return [head] + exons[first:]

    last_exon = exons[last - 1]
    tail = Interval(last_exon.start, end - cumulative[last - 1] + last_exon.start)
    return [head] + exons[first:(last - 1)] + [tail]
def read_scrubbed_junction_to_tree(junction_filename):
    """
    Load a scrubbed junction file into interval trees keyed by (chrom, strand).

    junction_filename --- tab-delimited file with exactly 4 columns per line:
                          chrom, left, right, strand
                          (left/right are already 0-based start, 0-based end)

    Returns defaultdict of (chrom, strand) --> IntervalTree; every junction is
    added with Interval(left, right) as its data.
    Raises ValueError if a line does not split into exactly 4 fields or if
    left/right are not integers.
    """
    tree = defaultdict(lambda: IntervalTree())
    # context manager so the handle is released even when a line fails to parse
    with open(junction_filename) as handle:
        for line in handle:
            chrom, left, right, strand = line.strip().split('\t')
            left, right = int(left), int(right)  # already 0-based start, 0-based end
            tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
def scrub_ref_exons(r, tree):
    """
    Rebuild r.ref_exons so every internal exon boundary sits on a junction from tree.

    r --- record exposing .chr, .strand and .ref_exons (Intervals, 0-based start, 1-based end)
    tree --- mapping of (chrom, strand) --> junction tree, handed to find_best_match_junction

    Returns the list of adjusted Interval, or None (after reporting on stderr)
    as soon as one donor-acceptor pair has no matching junction.
    """
    exons = r.ref_exons
    scrubbed = []
    segment_start = exons[0].start
    for idx in range(1, len(exons)):
        donor = exons[idx - 1].end - 1  # make it 0-based
        accep = exons[idx].start  # start is already 0-based
        hit = find_best_match_junction(tree[r.chr, r.strand], donor, accep)
        if hit is None:
            message = "donor-acceptor site {0},{1},{2}-{3} has no hit in tree!".format(
                r.chr, r.strand, donor, accep)
            print(message, file=sys.stderr)
            return None
        # close the current exon on the matched donor, reopen on the matched acceptor
        scrubbed.append(Interval(segment_start, hit.start + 1))
        segment_start = hit.end
    scrubbed.append(Interval(segment_start, exons[-1].end))
    return scrubbed
def scrub_ref_exons(r: Dict[str, Any], tree: IntervalTree) -> Optional[List[Interval]]:
    """
    Rebuild r.ref_exons so every internal exon boundary sits on a junction from tree.

    r --- record read through the attributes .chr, .strand and .ref_exons
          (NOTE(review): the Dict annotation does not match this attribute access)
    tree --- indexed by (chrom, strand); the result is handed to find_best_match_junction

    Returns the list of adjusted Interval, or None (after logging at INFO level)
    as soon as one donor-acceptor pair has no matching junction.
    """
    exons = r.ref_exons
    scrubbed = []
    segment_start = exons[0].start
    for idx in range(1, len(exons)):
        donor = exons[idx - 1].end - 1  # make it 0-based
        accep = exons[idx].start  # start is already 0-based
        hit = find_best_match_junction(tree[r.chr, r.strand], donor, accep)
        if hit is None:
            logger.info(
                f"donor-acceptor site {r.chr},{r.strand},{donor}-{accep} has no hit in tree!"
            )
            return None
        # close the current exon on the matched donor, reopen on the matched acceptor
        scrubbed.append(Interval(segment_start, hit.start + 1))
        segment_start = hit.end
    scrubbed.append(Interval(segment_start, exons[-1].end))
    return scrubbed
def scrub_junctions(report_filename, output_filename, min_sample, min_transcript, accept_all_canonical):
    """
    Filter a junction report, write the kept junctions out, and index them.

    report_filename --- junction report, read via read_junction_report
    output_filename --- destination file; one line per kept junction:
                        chrom, left, right, strand (tab-delimited)
    min_sample, min_transcript, accept_all_canonical ---
                        passed unchanged to scrub_junction_by_label

    Returns defaultdict of (chrom, strand) --> IntervalTree holding every kept
    junction with Interval(left, right) as its data.
    """
    tree = defaultdict(lambda: IntervalTree())
    # context manager so the output is flushed and closed even if scrubbing raises
    with open(output_filename, 'w') as f:
        for _label, junctions in read_junction_report(report_filename):
            good = scrub_junction_by_label(junctions, min_sample, min_transcript, accept_all_canonical)
            for r in good:
                a, b = int(r['left']), int(r['right'])  # 0-based start, 0-based end
                f.write("{chrom}\t{left}\t{right}\t{strand}\n".format(
                    chrom=r['chr'], left=r['left'], right=r['right'], strand=r['strand']))
                tree[r['chr'], r['strand']].add(a, b, Interval(a, b))
    return tree
def read_scrubbed_junction_to_tree(junction_filename):
    """
    Load a junction BED file into interval trees keyed by (chrom, strand).

    junction_filename --- tab-delimited file whose lines have either
                          4 columns: chrom, left, right, strand
                          6 columns: chrom, left, right, name, count, strand
                          (left/right are already 0-based start, 0-based end);
                          a first line starting with "track" is skipped

    Returns defaultdict of (chrom, strand) --> IntervalTree; every junction is
    added with Interval(left, right) as its data.
    Raises Exception if a line has neither 4 nor 6 columns.
    """
    tree = defaultdict(lambda: IntervalTree())
    # context manager so the handle is released even when a line is rejected
    with open(junction_filename) as f:
        # only consume the first line when it is a "track" header
        if not f.readline().startswith('track'):
            f.seek(0)
        for line in f:
            raw = line.strip().split('\t')
            if len(raw) == 4:
                chrom, left, right, strand = raw
            elif len(raw) == 6:
                chrom, left, right, _name, _count, strand = raw
            else:
                # call form of raise: valid on both Python 2 and Python 3
                raise Exception(
                    "Expects junction BED file to have either 4 or 6 columns! Saw {0}!".format(
                        len(raw)))
            left, right = int(left), int(right)  # already 0-based start, 0-based end
            tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
def count_repeats_for_motif(seq, motif, tally, intervals=None):
    """
    Record every occurrence of motif inside the given intervals of seq.

    seq --- plain sequence to search for the repeats (motifs)
    motif --- plain sequence of repeat, ex: CGG, AGG (used as the re.finditer pattern)
    tally --- mapping of motif --> list; the 0-based position of each hit in seq
              is appended to tally[motif] (presumably a defaultdict(list) --- confirm with caller)
    intervals --- 0-based start, 1-based end of Intervals to search motif in;
                  None means the whole sequence

    Returns a list of Interval: the gaps before each hit, plus any input
    interval that had no hit at all (returned as the same object).
    """
    if intervals is None:  # use the whole sequence
        intervals = [Interval(0, len(seq))]

    new_intl = []
    for intl in intervals:
        prev_end = intl.start
        found_flag = False
        for m in re.finditer(motif, seq[intl.start:intl.end]):
            # match offsets are relative to the slice; shift them back onto seq
            # so they can be compared with prev_end, which is absolute
            hit_start = intl.start + m.start()
            tally[motif].append(hit_start)
            if hit_start > prev_end:
                # new interval is prev_end (0-based), hit_start (1-based)
                new_intl.append(Interval(prev_end, hit_start))
            prev_end = intl.start + m.end()
            found_flag = True
        # NOTE(review): the stretch after the last hit (prev_end .. intl.end) is
        # not returned --- confirm that this is intended
        if not found_flag:
            new_intl.append(intl)
    return new_intl
def read_probe_bed(bed_filename, start_base=0, end_base=1):
    """
    Read a probe BED file <chrom>, <start>, <end>

    bed_filename, start_base, end_base --- handed as-is to BED.SimpleBEDReader

    Return (tree, gene_info):
      tree --- dict of chrom --> IntervalTree w/ data=(index, interval)
      gene_info --- dict of index --> name, only for records whose name is not None
    index is the 0-based position of the record in the file.
    """
    tree = {}
    gene_info = {}
    records = BED.SimpleBEDReader(bed_filename, start_base, end_base)
    for index, rec in enumerate(records):
        chrom_tree = tree.get(rec.chr)
        if chrom_tree is None:
            # first probe seen on this chromosome
            chrom_tree = tree[rec.chr] = IntervalTree()
        chrom_tree.add(rec.start, rec.end, (index, Interval(rec.start, rec.end)))
        if rec.name is not None:
            gene_info[index] = rec.name
    return tree, gene_info
def scrub_junctions(
    report_filename: Union[str, Path],
    output_filename: Union[str, Path],
    min_sample: int,
    min_transcript: int,
    accept_all_canonical: bool,
) -> IntervalTree:
    """
    Filter a junction report, write the kept junctions out, and index them.

    report_filename --- junction report, read via read_junction_report
    output_filename --- destination file; one tab-delimited line per kept
                        junction: chrom, left, right, strand
    min_sample, min_transcript, accept_all_canonical ---
                        passed unchanged to scrub_junction_by_label

    Returns a defaultdict of (chrom, strand) --> IntervalTree holding every
    kept junction with Interval(left, right) as its data.
    """
    tree = defaultdict(IntervalTree)
    with open(output_filename, "w") as out_handle:
        for _, junctions in read_junction_report(report_filename):
            kept = scrub_junction_by_label(
                junctions, min_sample, min_transcript, accept_all_canonical
            )
            for r in kept:
                # 0-based start, 0-based end
                left = int(r["left"])
                right = int(r["right"])
                out_handle.write(f"{r['chr']}\t{r['left']}\t{r['right']}\t{r['strand']}\n")
                key = (r["chr"], r["strand"])
                tree[key].add(left, right, Interval(left, right))
    return tree
def read_scrubbed_junction_to_tree(
    junction_filename: Union[str, Path]) -> IntervalTree:
    """
    Load a junction BED file into interval trees keyed by (chrom, strand).

    junction_filename --- tab-delimited file whose lines have either
                          4 columns: chrom, left, right, strand
                          6 columns: chrom, left, right, name, count, strand
                          (left/right are already 0-based start, 0-based end);
                          a first line starting with "track" is skipped

    Returns a defaultdict of (chrom, strand) --> IntervalTree; every junction
    is added with Interval(left, right) as its data.
    Raises Exception if a line has neither 4 nor 6 columns.
    """
    tree = defaultdict(IntervalTree)
    with open(junction_filename) as f:
        first_line = f.readline()
        if not first_line.startswith("track"):
            # no header: rewind so the first line is parsed as data
            f.seek(0)
        for line in f:
            raw = line.strip().split("\t")
            n_columns = len(raw)
            if n_columns == 6:
                chrom, left, right, _name, _count, strand = raw
            elif n_columns == 4:
                chrom, left, right, strand = raw
            else:
                raise Exception(
                    f"Expects junction BED file to have either 4 or 6 columns! Saw {len(raw)}!"
                )
            # already 0-based start, 0-based end
            left = int(left)
            right = int(right)
            tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
def __init__(self, chrom, start, end, value=None, strand=None):
    """
    Initialise the interval, then attach its chromosome and strand.

    start, end, value --- forwarded positionally to Interval.__init__
    chrom --- stored as self.chrom
    strand --- stored as self.strand (None when not given)
    """
    Interval.__init__(self, start, end, value)
    self.chrom, self.strand = chrom, strand
def __init__(self, start, stop, genome=None, **kws):
    """
    Store the genome, normalise an optional strand keyword, then defer to BaseInterval.

    start, stop --- forwarded positionally to BaseInterval.__init__
    genome --- stored as self.genome (None when not given)
    kws --- extra keywords forwarded to BaseInterval.__init__; a 'strand' entry,
            when present, is first passed through _convert_strand
    """
    self.genome = genome
    try:
        given_strand = kws['strand']
    except KeyError:
        # no strand supplied: forward the keywords untouched
        pass
    else:
        kws['strand'] = _convert_strand(given_strand)
    BaseInterval.__init__(self, start, stop, **kws)
"""
For each interval in `bed1` count the number of intersecting regions in `bed2`.

usage: %prog bed1 bed2
"""
from __future__ import print_function

import sys

from bx.intervals import (Intersecter, Interval)

bed1, bed2 = sys.argv[1:3]

# chrom --> Intersecter loaded with every bed2 interval on that chromosome
ranges = {}
# context managers so both inputs are closed once they have been read
with open(bed2) as bed2_handle:
    for line in bed2_handle:
        fields = line.strip().split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        ranges[chrom].add_interval(Interval(start, end))

with open(bed1) as bed1_handle:
    for line in bed1_handle:
        fields = line.strip().split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        # echo the first three columns, then any remaining columns, then the count
        other = " ".join(fields[3:])
        out = " ".join(fields[:3] + [other])
        if chrom in ranges:
            print(out, len(ranges[chrom].find(start, end)))
        else:
            # no bed2 interval on this chromosome at all
            print(out, 0)
def calc_indels_from_sam(samFile):
    """
    Given an aligned SAM file, calculate indel statistics.

    A tab-delimited report is written to <samFile without its last extension>_indels.txt:
    one row per indel, or one row per nearby junction when the indel lies
    within MAX_DIST_FROM_JUNC of one or more splice junctions of the same read.

    :param samFile: aligned SAM file
    :return: indelsJunc (dict of pbid --> list of junctions near indel),
             indelsTotal (dict of pbid --> total indels count)
    """
    # CIGAR operations for which this function advances the genomic position
    advancing_ops = ('M', 'D', 'N', 'P', 'B')

    # rfind returns -1 when the name has no '.', and slicing with -1 would
    # silently drop the last character of the file name
    dot = samFile.rfind('.')
    stem = samFile[:dot] if dot != -1 else samFile
    out_file = stem + "_indels.txt"

    indelsJunc = defaultdict(list)
    indelsTotal = Counter()

    sam = pysam.AlignmentFile(samFile, "r")
    try:
        with open(out_file, "w") as fhandle:
            fout = DictWriter(fhandle, fieldnames=FIELDS_INDEL, delimiter='\t')
            fout.writeheader()

            for read in sam.fetch():
                if read.is_unmapped:
                    continue
                cigarLine = read.cigar
                name = str(read.query_name).split("|")[0]

                ## reading splice junctions and storing information
                pos_start = read.pos  # 0-based start
                spliceSites = []  # list of splice junctions (Interval(donor, acceptor))
                for (cigarType, cigarLength) in cigarLine:
                    op = CIGAR_TYPE_LIST[cigarType]
                    if op in advancing_ops:
                        pos_end = pos_start + cigarLength  # 1-based end
                        if op == 'N':  # skip (intron)
                            spliceSites.append(Interval(pos_start, pos_end))
                        pos_start = pos_end

                ## reading indels, comparing with splice junctions and writing information
                pos_start = read.pos  # 0-based start
                for (cigarType, cigarLength) in cigarLine:
                    op = CIGAR_TYPE_LIST[cigarType]
                    advances = op in advancing_ops
                    if advances:
                        pos_end = pos_start + cigarLength  # 1-based end

                    if op in ("I", "D"):  # insertion or deletion
                        pos_indel = pos_start  # 0-based
                        # an insertion occupies no genomic span, so give it a 1-bp footprint
                        pos_end_indel = pos_start + 1 if op == 'I' else pos_end  # 1-based
                        # indels in the sequence
                        indelsTotal[name] += 1
                        # indels near splice sites: either end of the indel close to either end of the junction
                        spliceSitesNearIndel = [
                            sj for sj in spliceSites
                            if abs(pos_indel - sj.start) < MAX_DIST_FROM_JUNC
                            or abs(pos_indel - sj.end + 1) < MAX_DIST_FROM_JUNC
                            or abs(pos_end_indel - 1 - sj.start) < MAX_DIST_FROM_JUNC
                            or abs(pos_end_indel - sj.end) < MAX_DIST_FROM_JUNC
                        ]

                        rec = {'isoform': name,
                               'indelStart': pos_indel + 1,  # make start 1-based
                               'indelEnd': pos_end_indel,
                               'nt': cigarLength,
                               'nearJunction': "FALSE",
                               'junctionStart': 'NA',
                               'junctionEnd': 'NA',
                               'indelType': 'insertion' if op == 'I' else 'deletion'}
                        if len(spliceSitesNearIndel) == 0:
                            fout.writerow(rec)
                        else:
                            rec['nearJunction'] = 'TRUE'
                            for sj in spliceSitesNearIndel:
                                rec['junctionStart'] = sj.start + 1  # make start now 1-based
                                rec['junctionEnd'] = sj.end  # end is already 1-based
                                fout.writerow(rec)
                                indelsJunc[name].append(sj)

                    if advances:
                        pos_start = pos_end
    finally:
        # release the alignment handle even if reading or writing failed
        sam.close()

    return dict(indelsJunc), indelsTotal