예제 #1
0
def make_exons_from_base_mapping(mapping, start, end, strand):
    """
    Build the genomic exon Intervals covered by transcript positions start..end.

    mapping is 0-based index on transcript  --> 0-based index  on genome
    however beware of strand!

    :param mapping: indexable by transcript position; every entry is a
        (genome_pos, is_junction) pair
    :param start: first transcript position to use (mapping[start] is read)
    :param end: last transcript position to use (mapping[end] is read)
    :param strand: '+' for the forward strand; any other value is handled
        as the '-' strand
    :return: list of Interval (0-based start, 1-based end); for the '-'
        strand the collected positions are paired walking from the last
        one back to the first
    """
    output = [mapping[start]]
    # range() instead of xrange(): identical iteration, and it also exists on Python 3
    for i in range(start + 1, end):
        entry = mapping[i]
        # keep junction positions only;
        # if the last position is the same, DON'T APPEND (was an indel)
        if entry[1] and entry != output[-1]:
            output.append(entry)
    if mapping[end] != output[-1]:
        output.append(mapping[end])

    # remember for Interval it is 0-based start, 1-based end
    # if len(output) is odd, must be 1bp into an exon
    # ex: [(xxx,True), (xxx,True), (xxx,False)] or
    #     [.....(xxx,True), xxx(True)]
    if len(output) == 1:
        output = [output[0], output[0]]  # just duplicate it
    elif len(output) % 2 == 1:
        if output[0][1] and output[1][1]:
            output.insert(0, output[0])
        elif output[-1][1] and output[-2][1]:
            output.append(output[-1])
        # NOTE(review): if neither pattern matches, output stays odd-length and
        # the pairing below will not line up -- confirm this cannot happen.

    if strand == '+':
        return [Interval(output[i][0], output[i + 1][0] + 1)
                for i in range(0, len(output), 2)]
    # - strand: consume the pairs from the end of the list backwards
    return [Interval(output[i][0], output[i - 1][0] + 1)
            for i in range(len(output) - 1, -1, -2)]
예제 #2
0
def get_exon_coordinates(exons, start, end):
    """
    Return the set of "exons" (genome location) that
    is where the nucleotide start-end is

    start is 0-based
    end is 1-based
    exons is a list of Interval (0-based start, 1-based end)

    The first and last returned Intervals are trimmed to start/end. Exons
    that are covered completely (including a last exon that `end` reaches
    the end of) are returned as the original objects from `exons`.

    Raises AssertionError when start-end is not inside the transcript
    (a 30-bp overshoot of `end` is tolerated and trimmed).
    """
    # ex: [0, 945, 1065, 1141, 1237] accumulative length of exons
    acc_lens = [0]
    for e in exons:
        acc_lens.append(acc_lens[-1] + (e.end - e.start))
    len_of_transcript = acc_lens[-1]

    # confirm that start-end is in the range of the transcript!
    assert (
        0 <= start < end <= len_of_transcript + 30
    )  # allow a 30-bp slack due to PacBio indels

    end = min(end, len_of_transcript)  # trim it to the end if necessary (for PacBio)

    # i, j are the 1-based indices of the exons holding the first and the last
    # base. `end` is exclusive, so the last base is end-1: bisect_left puts an
    # `end` that sits exactly on an exon boundary into the exon that finishes
    # there instead of the next one (bisect_right did the latter, which added
    # an untouched/empty exon, and an `end` inside the last exon was not trimmed).
    i = bisect.bisect_right(acc_lens, start)
    j = bisect.bisect_left(acc_lens, end)

    first = exons[i - 1]
    # for the first exon, the offset is start-acc+e.start
    first_start = start - acc_lens[i - 1] + first.start
    if i == j:
        return [Interval(first_start, end - acc_lens[i - 1] + first.start)]

    last = exons[j - 1]
    if end == acc_lens[j]:
        tail = last  # whole exon covered: hand back the caller's own object
    else:
        # for the last exon, the end point is end-acc+e.start
        tail = Interval(last.start, end - acc_lens[j - 1] + last.start)
    return [Interval(first_start, first.end)] + exons[i:(j - 1)] + [tail]
예제 #3
0
def read_scrubbed_junction_to_tree(junction_filename):
    """
    Load a scrubbed junction file into per-(chrom, strand) interval trees.

    Each line must hold exactly four tab-separated columns:
    chrom, left, right, strand (left/right are used as given).

    :param junction_filename: path of the tab-delimited junction file
    :return: defaultdict mapping (chrom, strand) --> IntervalTree, where every
        junction is added with an Interval(left, right) as its data
    :raises ValueError: if a line does not split into four columns or
        left/right are not integers
    """
    tree = defaultdict(lambda: IntervalTree())
    # context manager so the handle is closed even if a line fails to parse
    with open(junction_filename) as f:
        for line in f:
            chrom, left, right, strand = line.strip().split('\t')
            left, right = int(left), int(right)  # already 0-based start, 0-based end
            tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
예제 #4
0
def scrub_ref_exons(r, tree):
    """
    Rebuild r's exon list so every splice junction sits on a junction from tree.

    For each pair of neighbouring exons the donor (last base of the upstream
    exon, 0-based) and acceptor (start of the downstream exon) are handed to
    find_best_match_junction together with tree[r.chr, r.strand].

    :param r: record exposing .ref_exons (exons with .start/.end), .chr, .strand
    :param tree: mapping (chr, strand) --> junction tree to search
    :return: list of Interval with the adjusted exon boundaries, or None
        (after reporting on stderr) as soon as one junction has no match
    """
    exons = r.ref_exons
    scrubbed = []
    segment_start = exons[0].start
    for upstream, downstream in zip(exons, exons[1:]):
        donor = upstream.end - 1  # make it 0-based
        accep = downstream.start  # start is already 0-based
        match = find_best_match_junction(tree[r.chr, r.strand], donor, accep)
        if match is None:
            message = "donor-acceptor site {0},{1},{2}-{3} has no hit in tree!".format(
                r.chr, r.strand, donor, accep)
            print(message, file=sys.stderr)
            return None

        # exon closes one past the matched donor; the next one opens at the matched acceptor
        scrubbed.append(Interval(segment_start, match.start + 1))
        segment_start = match.end

    scrubbed.append(Interval(segment_start, exons[-1].end))
    return scrubbed
def scrub_ref_exons(r: Any,
                    tree: Dict[Any, IntervalTree]) -> Optional[List[Interval]]:
    """
    Rebuild r's exon list so every splice junction sits on a junction from tree.

    For each pair of neighbouring exons the donor (last base of the upstream
    exon, 0-based) and acceptor (start of the downstream exon) are handed to
    find_best_match_junction together with tree[r.chr, r.strand].

    :param r: record read by attribute (.ref_exons, .chr, .strand) -- it is
        not a dict, hence the Any annotation
    :param tree: mapping keyed by (chr, strand) whose values are the junction
        trees to search
    :return: list of Interval with the adjusted exon boundaries, or None
        (after logging the failing site) as soon as one junction has no match
    """
    n = len(r.ref_exons)
    new_ref_exons = []
    cur_start = r.ref_exons[0].start
    for i in range(n - 1):
        donor = r.ref_exons[i].end - 1  # make it 0-based
        accep = r.ref_exons[i + 1].start  # start is already 0-based
        match = find_best_match_junction(tree[r.chr, r.strand], donor, accep)
        if match is None:
            logger.info(
                f"donor-acceptor site {r.chr},{r.strand},{donor}-{accep} has no hit in tree!"
            )
            return None

        # exon closes one past the matched donor; the next one opens at the matched acceptor
        new_ref_exons.append(Interval(cur_start, match.start + 1))
        cur_start = match.end
    new_ref_exons.append(Interval(cur_start, r.ref_exons[-1].end))
    return new_ref_exons
예제 #6
0
def scrub_junctions(report_filename, output_filename, min_sample, min_transcript, accept_all_canonical):
    """
    Filter the junctions of a junction report, write the survivors to
    output_filename and return them as interval trees.

    Every group yielded by read_junction_report(report_filename) is passed to
    scrub_junction_by_label together with min_sample, min_transcript and
    accept_all_canonical; each junction it returns is written as one
    "chrom<TAB>left<TAB>right<TAB>strand" line.

    :return: defaultdict mapping (chr, strand) --> IntervalTree holding an
        Interval(left, right) for every written junction
    """
    tree = defaultdict(lambda: IntervalTree())
    # context manager so the output is flushed and closed even on an exception
    with open(output_filename, 'w') as f:
        for _label, junctions in read_junction_report(report_filename):
            good = scrub_junction_by_label(junctions, min_sample, min_transcript, accept_all_canonical)
            for r in good:
                a, b = int(r['left']), int(r['right'])  # 0-based start, 0-based end
                f.write("{chrom}\t{left}\t{right}\t{strand}\n".format(
                    chrom=r['chr'], left=r['left'], right=r['right'], strand=r['strand']))
                tree[r['chr'], r['strand']].add(a, b, Interval(a, b))
    return tree
def read_scrubbed_junction_to_tree(junction_filename):
    """
    Load a junction BED file into per-(chrom, strand) interval trees.

    A first line starting with "track" is skipped. Every other line must have
    4 tab-separated columns (chrom, left, right, strand) or 6
    (chrom, left, right, name, count, strand).

    :param junction_filename: path of the junction BED file
    :return: defaultdict mapping (chrom, strand) --> IntervalTree, where every
        junction is added with an Interval(left, right) as its data
    :raises Exception: if a line has neither 4 nor 6 columns
    """
    tree = defaultdict(lambda: IntervalTree())
    # context manager so the handle is closed even when a bad line raises
    with open(junction_filename) as f:
        # only the first line may be a header; otherwise rewind and parse it too
        if not f.readline().startswith('track'):
            f.seek(0)
        for line in f:
            raw = line.strip().split('\t')
            if len(raw) == 4:
                chrom, left, right, strand = raw
            elif len(raw) == 6:
                chrom, left, right, _name, _count, strand = raw
            else:
                # call form of raise: valid on both Python 2 and Python 3
                raise Exception(
                    "Expects junction BED file to have either 4 or 6 columns! Saw {0}!".format(
                        len(raw)))
            left, right = int(left), int(right)  # already 0-based start, 0-based end
            tree[chrom, strand].add(left, right, Interval(left, right))
    return tree
예제 #8
0
def count_repeats_for_motif(seq, motif, tally, intervals=None):
    """
    Record every hit of motif inside the given intervals of seq and return
    the stretches that lie in front of / between the hits.

    seq --- plain sequence to search for the repeats (motifs)
    motif --- plain sequence of repeat, ex: CGG, AGG (passed to re.finditer,
              so it is interpreted as a regular expression)
    tally --- mapping motif --> list; the 0-based position on seq of every
              hit is appended to tally[motif] (the entry must already exist
              or be created on access, e.g. by a defaultdict)
    intervals --- 0-based start, 1-based end of Intervals to search motif in
                  (default: the whole sequence)

    Returns a list of Intervals in seq coordinates: for an interval with hits,
    the non-empty gaps before each hit; an interval without any hit is
    returned unchanged.
    """
    if intervals is None:  # use the whole sequence
        intervals = [Interval(0, len(seq))]

    new_intl = []
    for intl in intervals:
        prev_end = intl.start
        found_flag = False
        for m in re.finditer(motif, seq[intl.start:intl.end]):
            # m.start()/m.end() are relative to the slice; shift to seq coordinates
            hit_start = intl.start + m.start()
            tally[motif].append(hit_start)
            # compare in seq coordinates on both sides -- the raw m.start() is
            # slice-relative and missed gaps whenever intl.start > 0
            if hit_start > prev_end:
                # new interval is prev_end (0-based), hit_start (1-based)
                new_intl.append(Interval(prev_end, hit_start))
            prev_end = intl.start + m.end()
            found_flag = True
        if not found_flag:
            new_intl.append(intl)
        # NOTE(review): the stretch after the last hit (prev_end..intl.end) is
        # not returned -- confirm whether that is intended.
    return new_intl
def read_probe_bed(bed_filename, start_base=0, end_base=1):
    """
    Read a probe BED file <chrom>, <start>, <end> through BED.SimpleBEDReader.

    Records are numbered 0, 1, 2, ... in reading order.

    Return a tuple (tree, gene_info):
      tree      --- dict of chrom --> IntervalTree w/ data=(index, interval)
      gene_info --- dict of index --> record name, only for records whose
                    name is not None
    """
    trees_by_chrom = {}
    names_by_index = {}
    records = BED.SimpleBEDReader(bed_filename, start_base, end_base)
    for index, rec in enumerate(records):
        if rec.chr not in trees_by_chrom:
            trees_by_chrom[rec.chr] = IntervalTree()
        probe = Interval(rec.start, rec.end)
        trees_by_chrom[rec.chr].add(rec.start, rec.end, (index, probe))
        if rec.name is not None:
            names_by_index[index] = rec.name
    return trees_by_chrom, names_by_index
def scrub_junctions(
    report_filename: Union[str, Path],
    output_filename: Union[str, Path],
    min_sample: int,
    min_transcript: int,
    accept_all_canonical: bool,
) -> IntervalTree:
    """
    Filter the junctions of a junction report, write the survivors to
    output_filename and return them as interval trees.

    Every group yielded by read_junction_report(report_filename) is passed to
    scrub_junction_by_label together with min_sample, min_transcript and
    accept_all_canonical; each junction it returns becomes one
    "chr<TAB>left<TAB>right<TAB>strand" line in the output file.

    Returns a defaultdict keyed by (chr, strand) whose IntervalTree values
    hold an Interval(left, right) per written junction.
    """
    trees = defaultdict(IntervalTree)
    with open(output_filename, "w") as bed_out:
        for _, label_junctions in read_junction_report(report_filename):
            kept = scrub_junction_by_label(
                label_junctions, min_sample, min_transcript, accept_all_canonical
            )
            for junc in kept:
                # 0-based start, 0-based end
                left = int(junc["left"])
                right = int(junc["right"])
                bed_out.write(
                    f"{junc['chr']}\t{junc['left']}\t{junc['right']}\t{junc['strand']}\n"
                )
                key = (junc["chr"], junc["strand"])
                trees[key].add(left, right, Interval(left, right))
    return trees
def read_scrubbed_junction_to_tree(
        junction_filename: Union[str, Path]) -> IntervalTree:
    """
    Load a junction BED file into per-(chrom, strand) interval trees.

    A first line starting with "track" is skipped. Every other line must have
    4 tab-separated columns (chrom, left, right, strand) or 6
    (chrom, left, right, name, count, strand); anything else raises Exception.

    Returns a defaultdict keyed by (chrom, strand) whose IntervalTree values
    hold an Interval(left, right) per junction.
    """
    trees = defaultdict(IntervalTree)
    with open(junction_filename) as handle:
        first_line = handle.readline()
        if not first_line.startswith("track"):
            # no header: rewind so the first line is parsed as data
            handle.seek(0)
        for line in handle:
            raw = line.strip().split("\t")
            if len(raw) not in (4, 6):
                raise Exception(
                    f"Expects junction BED file to have either 4 or 6 columns! Saw {len(raw)}!"
                )
            # both layouts start with chrom/left/right and end with strand
            chrom, left_text, right_text = raw[0], raw[1], raw[2]
            strand = raw[-1]
            # already 0-based start, 0-based end
            left, right = int(left_text), int(right_text)
            trees[chrom, strand].add(left, right, Interval(left, right))
    return trees
예제 #12
0
 def __init__(self, chrom, start, end, value=None, strand=None):
     """Initialise the base Interval with start/end/value, then record chrom and strand."""
     Interval.__init__(self, start, end, value)
     self.chrom, self.strand = chrom, strand
예제 #13
0
 def __init__(self, start, stop, genome=None, **kws):
     """
     Store genome, pass a 'strand' keyword (if given) through
     _convert_strand, then delegate to BaseInterval.__init__ with
     start, stop and the keywords.
     """
     self.genome = genome
     normalized = {
         key: (_convert_strand(val) if key == 'strand' else val)
         for key, val in kws.items()
     }
     BaseInterval.__init__(self, start, stop, **normalized)
예제 #14
0
"""
For each interval in `bed1` count the number of intersecting regions in `bed2`.

usage: %prog bed1 bed2
"""

from __future__ import print_function

import sys

from bx.intervals import (Intersecter, Interval)

bed1, bed2 = sys.argv[1:3]

# chrom --> Intersecter holding every bed2 interval on that chromosome
ranges = {}
# context managers so both inputs are closed deterministically
with open(bed2) as bed2_file:
    for line in bed2_file:
        fields = line.strip().split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        if chrom not in ranges:
            ranges[chrom] = Intersecter()
        ranges[chrom].add_interval(Interval(start, end))

# echo each bed1 line followed by the number of bed2 intervals it intersects
with open(bed1) as bed1_file:
    for line in bed1_file:
        fields = line.strip().split()
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        other = " ".join(fields[3:])
        out = " ".join(fields[:3] + [other])
        if chrom in ranges:
            print(out, len(ranges[chrom].find(start, end)))
        else:
            # chromosome absent from bed2: nothing can intersect
            print(out, 0)
예제 #15
0
 def __init__(self, start, stop, genome=None, **kws):
     """
     Remember genome, convert a supplied 'strand' keyword with
     _convert_strand, and hand start, stop and the keywords on to
     BaseInterval.__init__.
     """
     self.genome = genome
     try:
         raw_strand = kws['strand']
     except KeyError:
         pass  # no strand given: forward the keywords untouched
     else:
         kws['strand'] = _convert_strand(raw_strand)
     BaseInterval.__init__(self, start, stop, **kws)
예제 #16
0
def calc_indels_from_sam(samFile):
    """
    Given an aligned SAM file, calculate indel statistics.

    Every insertion/deletion in the CIGAR of a mapped read is written as a
    tab-delimited row (columns FIELDS_INDEL) to
    "<samFile without its extension>_indels.txt". An indel closer than
    MAX_DIST_FROM_JUNC to one or more splice junctions of the same read is
    written once per such junction with nearJunction = TRUE.

    :param samFile: aligned SAM file
    :return: indelsJunc (dict of pbid --> list of junctions near indel), indelsTotal (dict of pbid --> total indels count)
             pbid is the read name up to the first "|"
    """
    import os  # local import: only needed to derive the output path

    # CIGAR operations this function treats as advancing the reference position.
    # NOTE(review): the SAM specification lists M, D, N, = and X as
    # reference-consuming; '=' / 'X' are absent here while 'P' / 'B' are
    # present -- confirm against CIGAR_TYPE_LIST and the aligner's output.
    ref_ops = ('M', 'D', 'N', 'P', 'B')

    indelsJunc = defaultdict(list)
    indelsTotal = Counter()

    sam = pysam.AlignmentFile(samFile, "r")
    try:
        # splitext only strips an extension of the file name itself, so a
        # missing extension or a dot in a directory name no longer mangles the path
        out_file = os.path.splitext(samFile)[0] + "_indels.txt"
        with open(out_file, "w") as fhandle:
            fout = DictWriter(fhandle, fieldnames=FIELDS_INDEL, delimiter='\t')
            fout.writeheader()

            for read in sam.fetch():
                if read.is_unmapped:
                    continue
                cigarLine = read.cigar

                ## reading splice junctions and storing information
                pos_start = read.pos  # 0-based start
                spliceSites = []  # list of splice junctions (Interval(donor, acceptor))
                for (cigarType, cigarLength) in cigarLine:
                    op = CIGAR_TYPE_LIST[cigarType]
                    if op in ref_ops:
                        pos_end = pos_start + cigarLength  # 1-based end
                        if op == 'N':  # skip (intron)
                            spliceSites.append(Interval(pos_start, pos_end))
                        pos_start = pos_end

                ## reading indels, comparing with splice junctions and writing information
                pos_start = read.pos  # 0-based start
                for (cigarType, cigarLength) in cigarLine:
                    op = CIGAR_TYPE_LIST[cigarType]
                    if op in ref_ops:
                        pos_end = pos_start + cigarLength  # 1-based end

                    if op in ("I", "D"):  # insertion or deletion
                        pos_indel = pos_start  # 0-based
                        # an insertion is reported as a 1-bp span at its position
                        pos_end_indel = pos_start + 1 if op == 'I' else pos_end  # 1-based
                        name = str(read.query_name).split("|")[0]

                        # indels in the sequence
                        indelsTotal[name] += 1

                        # indels near splice sites: either end of the indel
                        # close to either end of the junction
                        spliceSitesNearIndel = [
                            sj for sj in spliceSites
                            if abs(pos_indel - sj.start) < MAX_DIST_FROM_JUNC
                            or abs(pos_indel - sj.end + 1) < MAX_DIST_FROM_JUNC
                            or abs(pos_end_indel - 1 - sj.start) < MAX_DIST_FROM_JUNC
                            or abs(pos_end_indel - sj.end) < MAX_DIST_FROM_JUNC
                        ]

                        rec = {'isoform': name,
                               'indelStart': pos_indel + 1,  # make start 1-based
                               'indelEnd': pos_end_indel,
                               'nt': cigarLength,
                               'nearJunction': "FALSE",
                               'junctionStart': 'NA',
                               'junctionEnd': 'NA',
                               'indelType': 'insertion' if op == 'I' else 'deletion'}
                        if len(spliceSitesNearIndel) == 0:
                            fout.writerow(rec)
                        else:
                            rec['nearJunction'] = 'TRUE'
                            for sj in spliceSitesNearIndel:
                                rec['junctionStart'] = sj.start + 1  # make start now 1-based
                                rec['junctionEnd'] = sj.end          # end is already 1-based
                                fout.writerow(rec)
                                indelsJunc[name].append(sj)

                    if op in ref_ops:
                        pos_start = pos_end
    finally:
        # always release the alignment file, also when parsing/writing fails
        sam.close()
    return dict(indelsJunc), indelsTotal