Example #1
0
def create_inttree_from_file(infile):
    """Create interval trees to store annotations, one per chromosome.

    Blank (whitespace-only) lines are ignored.

    Args:
    infile: handle of open BED file with annotations; every record must
        provide at least chromosome, start, stop and name columns

    Return:
    dictionary {chromosome name : interval tree with coordinates}
    """
    genome = {}
    for line in infile:
        # split() without arguments already discards surrounding whitespace
        parts = line.split()
        if not parts:
            # a blank line carries no record; skipping it avoids an
            # IndexError on parts[0]
            continue
        chrom, start, stop = parts[0], int(parts[1]), int(parts[2])
        name = parts[3]
        # first time we encounter a chromosome, create its interval tree
        if chrom not in genome:
            genome[chrom] = IntervalTree()
        # add interval to the chromosome's tree
        genome[chrom].add(start, stop, name)
    return genome
    def __init__(self, bed_file):
        """Load a BED file into one interval tree per chromosome.

        Lines with fewer than four tab-separated columns are reported on
        stdout and skipped. Each remaining line is stored in the tree of
        its chromosome, with the tuple of its first four columns as value.

        :param bed_file: path of the BED file to read
        :return: None; the result is kept in ``self.interval_tree_dict``
                 (chromosome name -> interval tree)
        """
        self.interval_tree_dict = dict()
        error_message = "Skipping line {0} too short with only {1} column(s).Check that it conforms to bed format:\n{2}"

        with open(bed_file, 'r') as bed_file_Handler:
            for count, line in enumerate(bed_file_Handler):
                # Strip the line terminator before splitting; otherwise the
                # name of a four-column record keeps its trailing newline.
                segmentProperties = line.rstrip("\r\n").split("\t")
                numberOfColumns = len(segmentProperties)
                if numberOfColumns < 4:
                    print(error_message.format(count + 1, numberOfColumns, line.strip()))
                    continue

                chromosome = segmentProperties[0]
                segment_start = int(segmentProperties[1])
                segment_end = int(segmentProperties[2])

                # one tree per chromosome, created on first use
                if chromosome in self.interval_tree_dict:
                    tree = self.interval_tree_dict[chromosome]
                else:
                    tree = IntervalTree()
                    self.interval_tree_dict[chromosome] = tree

                tree.add(segment_start, segment_end, tuple(segmentProperties[:4]))
    def __init__(self, bed_file_path):
        """Read a BED file and index its records in per-chromosome trees.

        Lines with fewer than four tab-separated columns are reported on
        stdout and skipped. Every other line is added to the interval tree
        of its chromosome; the stored value is the tuple of the first four
        columns.

        :param bed_file_path: path of the BED file to read
        """
        self.interval_tree_dict = dict()
        error_message = "Skipping line {0} - too short, only {1} column(s):\n{2}"

        with open(bed_file_path, 'r') as bed_file:
            for count, line in enumerate(bed_file):
                # Remove the line terminator first so that the last column
                # of a four-column record does not end in a newline.
                split_line = line.rstrip("\r\n").split("\t")
                number_of_columns = len(split_line)

                if number_of_columns < 4:
                    print(error_message.format(count + 1, number_of_columns, line.strip()))
                    continue

                chromosome = split_line[0]
                start, end = int(split_line[1]), int(split_line[2])

                # one tree per chromosome, created on first use
                if chromosome in self.interval_tree_dict:
                    tree = self.interval_tree_dict[chromosome]
                else:
                    tree = IntervalTree()
                    self.interval_tree_dict[chromosome] = tree

                tree.add(start, end, tuple(split_line[:4]))
def process_file(chr_features, pc_file):
    """Stream a fixedStep score file and intersect its blocks with features.

    The file is opened with gzip when its name ends in 'gz'; otherwise the
    plain file is used if it exists, falling back to pc_file + '.gz'.
    Scores are collected until the next 'fixedStep' header and each
    finished block is handed to intersect_scores together with the
    interval tree of the current chromosome. Progress goes to stderr.

    Args:
        chr_features: mapping of chromosome name to interval tree.
        pc_file: path of the score file.

    Raises:
        IOError: if neither pc_file nor pc_file + '.gz' exists.
    """
    if pc_file[-2:] == 'gz':
        pc_f = gzip.open(pc_file)
    elif os.path.isfile(pc_file):
        pc_f = open(pc_file)
    elif os.path.isfile(pc_file + '.gz'):
        pc_f = gzip.open(pc_file + '.gz')
    else:
        # Without this branch pc_f stayed unbound and the function failed
        # later with a confusing UnboundLocalError.
        raise IOError('Cannot find %s or %s.gz' % (pc_file, pc_file))

    # The file name supplies the chromosome until a header line replaces it.
    chrom = os.path.split(pc_file)[1].split('.')[0]
    sys.stderr.write('Processing %s ... ' % chrom)

    block_start = 0
    block_scores = []

    try:
        for line in pc_f:
            if line.startswith('fixedStep'):
                # A new header closes the previous block: flush it first.
                if block_scores:
                    intersect_scores(chr_features.get(chrom, IntervalTree()),
                                     block_start, block_scores)

                a = line.split()
                # Fields 1 and 2 lose their first six characters
                # (presumably the 'chrom=' and 'start=' prefixes).
                chrom = a[1][6:]
                block_start = int(a[2][6:])
                block_scores = []
            else:
                block_scores.append(float(line.rstrip()))

        # Flush the block that was still open at end of file.
        intersect_scores(chr_features.get(chrom, IntervalTree()), block_start,
                         block_scores)
    finally:
        # Close the handle even when parsing or intersecting fails.
        pc_f.close()

    sys.stderr.write('Done\n')
Example #5
0
def read_in_somatic_vcf_file(somatic_snv_files, clonal_percs, query_chr, truth_set_cn_calls, output_dir, subsample_somatic_snvs):
    """Read clonal somatic SNV VCF files for one chromosome.

    Every kept SNV is stored in an interval tree (spanning pos..pos) and in
    a dictionary keyed by position. The dictionary is pretty-printed and
    written as JSON to 'forced_somatic_snv_frequencies_<query_chr>.json'
    inside output_dir.

    Args:
        somatic_snv_files: VCF paths, paired element-wise with clonal_percs.
        clonal_percs: per-file factor the SNV frequency is multiplied by.
        query_chr: only records on this chromosome are used.
        truth_set_cn_calls: mapping chromosome -> iterable of region dicts
            with 'start', 'end', 'firsthaplotype_CN', 'secondhaplotype_CN'.
        output_dir: directory that receives the JSON file.
        subsample_somatic_snvs: a record is dropped when a uniform random
            draw is greater than this value.

    Returns:
        The interval tree holding the kept SNVs.
    """
    expected_format = 'DP:FDP:SDP:SUBDP:AU:CU:GU:TU'
    json_path = os.path.join(output_dir, 'forced_somatic_snv_frequencies_' + str(query_chr) + '.json')

    # 'with' guarantees that the JSON and VCF handles are closed, also on error.
    with open(json_path, 'w') as fsf:
        print("ri ",query_chr)

        h = IntervalTree()
        h2 = {}

        for (somatic_snv_file, clonal_perc) in zip(somatic_snv_files, clonal_percs):
            # for now just do SNVs as adding in indels involve increasing the
            # size of reads which could cause issues
            with open(somatic_snv_file, 'r') as FH:
                for line in FH:
                    if line.startswith('#'):
                        continue
                    # The subsampling draw happens before the chromosome
                    # filter; keep that order so seeded runs stay reproducible.
                    if random.random() > subsample_somatic_snvs:
                        continue
                    (chrom, pos, _snv_id, ref, alt, _qual, _flt, _info, fmt, _normal, tumor) = line.strip().split()
                    pos = int(pos)
                    if chrom != query_chr:
                        continue

                    if fmt != expected_format:
                        sys.exit('vcf format not the usual' + expected_format)
                    print("tumor ",tumor)
                    (DP, FDP, SDP, SUBDP, AU, CU, GU, TU) = tumor.strip().split(':')
                    cov = float(DP)

                    # Count fields of the three non-reference bases. A ref
                    # outside A/C/G/T used to reuse the list of the previous
                    # record (or fail on the first one); now it is skipped.
                    # NOTE(review): the sorted counts are not used further down.
                    by_base = {'A': AU, 'C': CU, 'G': GU, 'T': TU}
                    if ref in by_base:
                        # just using first tier reads for now
                        (first, second, third) = sorted(
                            [int(cv.split(',')[0]) for base, cv in by_base.items() if base != ref],
                            reverse=True)

                    if random.random() > 0.5:
                        somatic_haplotypeCN = 'firsthaplotype_CN'
                    else:
                        somatic_haplotypeCN = 'secondhaplotype_CN'

                    print("pos ",pos, "shcn ", somatic_haplotypeCN , " r ")

                    # Copy number defaults to 2; the last region containing
                    # pos wins.
                    region_CN = 2
                    for region in truth_set_cn_calls[query_chr]:
                        if pos >= region['start'] and pos <= region['end']:
                            region_CN = region[somatic_haplotypeCN]
                    somatic_mutation_freq = float(assign_freq_based_on_ploidy(region_CN))
                    somatic_mutation_freq *= float(clonal_perc)

                    record = {'pos': pos, 'ref': ref, 'alt': alt, 'line': line,
                              'freq': somatic_mutation_freq,
                              'somatic_haplotype': somatic_haplotypeCN}
                    # theoretically bug: could have snv and indel at same pos
                    # also bug if snp or indel last/first on read
                    h.add(pos, pos, record)
                    # the dictionary gets its own copy, as before
                    h2[pos] = dict(record)

        pprint.pprint(h2)
        json.dump(h2, fsf, indent=4, sort_keys=True)
    return h
Example #6
0
 def __init__(self, gtf_filename):
     """Create the empty lookup tables and fill them by reading the GTF file."""
     self.gtf_filename = gtf_filename
     # chr --> IntervalTree --> (0-start, 1-end, transcript ID)
     self.genome = defaultdict(IntervalTree)
     # tID --> IntervalTree --> (0-start, 1-end, {'ith': i-th exon, 'eID': exon ID})
     self.transcript = defaultdict(IntervalTree)
     # (0start,1end) --> list of (tID, ith-exon, chr)
     self.exon = defaultdict(list)
     # tID --> chr
     self.transcript_info = {}

     self.readGTF(self.gtf_filename)
Example #7
0
 def add(self,region):
     """Add a new Region-like object to the RegionList.

     The region goes into the interval tree stored under its chromosome,
     spanning region.rbeg..region.rend; the tree is created the first time
     a chromosome is seen.
     """
     chrom = region.chromosome
     try:
         itree = self[chrom]
     except KeyError:
         # first region on this chromosome: build its tree
         itree = IntervalTree()
         itree.insert(region.rbeg,region.rend,region)
         self[chrom]=itree
     else:
         # Insert outside the try block so that a KeyError raised by the
         # tree itself is not mistaken for a missing chromosome (which
         # would replace the existing tree).
         itree.insert(region.rbeg,region.rend,region)
def main(argv):
    """Index the peaks of a wig file and report them against a bed file.

    argv[1] is the bed file, argv[2] the wig file.
    """
    bed_path, wig_path = argv[1], argv[2]
    peak_tree = IntervalTree()
    for wig_peak in parse_wig(wig_path):
        peak_tree.insert_interval(wig_peak)

    report(peak_tree, bed_path)
Example #9
0
 def setUp(self):
     """Fixture: interval 50-59 is added first, then the other ten-wide steps from 0 to 100."""
     tree = IntervalTree()
     first_low = 50
     tree.add_interval(Interval(first_low, first_low + 9))
     remaining_lows = [low for low in range(0, 110, 10) if low != first_low]
     for low in remaining_lows:
         tree.add_interval(Interval(low, low + 9))
     self.intervals = tree
Example #10
0
def main(argv):
    """Load every peak parsed from the wig file (argv[2]) into an interval
    tree and pass the tree, with the bed file (argv[1]), to report."""
    bed_name, wig_name = argv[1], argv[2]
    tree = IntervalTree()
    for parsed_peak in parse_wig(wig_name):
        tree.insert_interval(parsed_peak)

    report(tree, bed_name)
Example #11
0
 def test_empty(self):
     """An empty tree answers each query with [] and traverse with None."""
     tree = IntervalTree()
     self.assertEqual([], tree.find(100, 300))
     # the single-position queries all share the same expectation
     for query_name in ("after", "before", "after_interval",
                        "before_interval", "upstream_of_interval",
                        "downstream_of_interval"):
         self.assertEqual([], getattr(tree, query_name)(100))
     self.assertEqual(None, tree.traverse(lambda x: x.append(1)))
Example #12
0
def btab_reclist_to_interval_list_0basedStart(recs):
    """
    Return chr, list of IntervalNode

    Each record contributes the interval (rStart1 - 1, rEnd1); the list is
    whatever the tree's traverse hands to the callback. The chr is taken
    from the first record.
    """
    interval_tree = IntervalTree()
    for record in recs:
        interval_tree.insert(record['rStart1'] - 1, record['rEnd1'])
    nodes = []
    interval_tree.traverse(nodes.append)
    return recs[0]['chr'], nodes
Example #13
0
def load_exons_and_genes(genesF):
    """Load exon and gene intervals from a tab-separated transcript table.

    Lines whose first character is 'c' are treated as headers and skipped.
    Every other line supplies cluster id, transcript name, reference id,
    strand, transcript start and end in its first six columns,
    comma-separated exon starts and ends in columns 10 and 11, and an
    alignment id in column 13 (1-based column numbers).

    Returns:
        (etrees, gtrees): two dicts keyed by reference id (with "chr"
        removed). etrees holds one interval per exon with value
        [cluster_id, alignID, strand]; gtrees holds one interval per
        cluster covering the widest span seen, with value cluster_id.
    """
    gtrees = {}
    etrees = {}
    # cluster_id ("gene") -> [smallest start, largest end, refid]
    t_to_gene_map = {}
    # 'with' closes the file even if a malformed line raises
    with open(genesF) as genesIN:
        # load individual transcripts (isoforms)
        for line in genesIN:
            # skip header
            if line[0] == 'c':
                continue
            fields = line.rstrip().split('\t')
            (cluster_id, _tname, refid, strand, tstart, tend) = fields[:6]
            refid = refid.replace("chr", "")
            eStarts = fields[9].split(',')
            eEnds = fields[10].split(',')
            alignID = fields[12]
            # now save the exons as intervals
            if refid not in etrees:
                etrees[refid] = IntervalTree()
            # use 1-based closed interval
            tstart = int(tstart) + 1
            for (eStart, eEnd) in zip(eStarts, eEnds):
                # a trailing comma leaves an empty entry behind
                if len(eStart) == 0:
                    continue
                # use 1-based closed interval
                eStart = int(eStart) + 1
                # must adjust for the open intervals (both) of the interval tree
                itv = Interval(eStart - 1, int(eEnd) + 1, value=[cluster_id, alignID, strand])
                etrees[refid].insert_interval(itv)
            # widen the cluster's span if this transcript extends it; the
            # refid of the first transcript seen for a cluster is kept
            tend = int(tend)
            span = t_to_gene_map.setdefault(cluster_id, [tstart, tend, refid])
            if tstart < span[0]:
                span[0] = tstart
            if tend > span[1]:
                span[1] = tend
    # now convert the cluster (gene) coordinate extents to intervals
    # (items() instead of the Python 2-only iteritems())
    for (cluster_id, span) in t_to_gene_map.items():
        (st, en, refid) = span
        if refid not in gtrees:
            gtrees[refid] = IntervalTree()
        # must adjust for the open intervals (both) of the interval tree
        itv = Interval(int(st) - 1, int(en) + 1, value=cluster_id)
        gtrees[refid].insert_interval(itv)
    return (etrees, gtrees)
Example #14
0
 def test_empty(self):
     """Queries on a tree without intervals return [], and traverse returns None."""
     empty_tree = IntervalTree()
     self.assertEqual([], empty_tree.find(100, 300))
     position_queries = ("after", "before", "after_interval",
                         "before_interval", "upstream_of_interval",
                         "downstream_of_interval")
     for method_name in position_queries:
         # looked up one at a time so the assertions run in the listed order
         self.assertEqual([], getattr(empty_tree, method_name)(100))
     self.assertEqual(None, empty_tree.traverse(lambda x: x.append(1)))
def exon_matching(
    exon_tree: IntervalTree,
    ref_exon: Interval,
    match_extend_tolerate_left: int,
    match_extend_tolerate_right: int,
    intervals_adjacent: bool = True,
) -> List[IntervalTree]:
    """
    Find a continuous exon path in exon_tree that matches ref_exon.

    exon_tree --- an IntervalTree made from .baseC/.altC using exon detection; probably only short read data
    ref_exon --- an Interval representing an exon; probably from PacBio
    match_extend_tolerate_left/right --- maximum difference allowed between the matched start/end and ref_exon's
    intervals_adjacent --- when true, the overlapping nodes must all be adjacent

    Returns the slice of overlapping nodes whose first start and last end
    are within tolerance, or None when nothing qualifies.
    """
    overlapping = exon_tree.find(ref_exon.start, ref_exon.end)
    if len(overlapping) == 0:
        # likely due to very low coverage on transcript
        return None

    # no splicing allowed: this must be one integral exon
    if intervals_adjacent and not c_branch.intervals_all_adjacent(overlapping):
        return None

    last_index = len(overlapping) - 1
    for i, left_node in enumerate(overlapping):
        if abs(left_node.start - ref_exon.start) > match_extend_tolerate_left:
            continue
        # start is close enough; take the furthest end that also is
        for j in range(last_index, i - 1, -1):
            end_gap = abs(overlapping[j].end - ref_exon.end)
            if end_gap <= match_extend_tolerate_right:
                return overlapping[i:(j + 1)]
    return None
Example #16
0
def load_repeats(repeatsF):
    """Load repeat annotations into one interval tree per reference id.

    The first line of repeatsF is treated as a header and skipped. Every
    other tab-separated line supplies reference id, start and end in
    columns 6-8 and two descriptive fields in columns 11 and 12 (1-based
    column numbers); the latter pair is stored as the interval's value.
    A (reference id, start, end) combination is indexed only once.

    Returns:
        dict mapping reference id -> interval tree.
    """
    rtrees = {}
    seen = set()
    # 'with' closes the file even if a malformed line raises
    with open(repeatsF) as repeatsIN:
        # skip header
        next(repeatsIN, None)
        for line in repeatsIN:
            fields = line.rstrip().split('\t')
            (refid, st, en) = fields[5:8]
            tname = fields[10]
            gtype = fields[11]

            # Already present in the interval tree? The key includes the
            # reference id: trees are per reference, so equal coordinates
            # on another reference are not duplicates.
            key = (refid, st, en)
            if key in seen:
                continue
            seen.add(key)
            if refid not in rtrees:
                rtrees[refid] = IntervalTree()
            itv = Interval(int(st), int(en), value=[tname, gtype])
            rtrees[refid].insert_interval(itv)
    return rtrees
Example #17
0
def BED_to_interval_tree(BED_file):
    """
    Creates an index of intervals, using an interval tree, for each BED entry

    Lines starting with '#' and blank lines are ignored. Regions that
    overlap an already indexed region are indexed as well.

    :param BED_file: file handler of a BED file

    :return dict mapping each chromosome name to its interval tree
    """
    from bx.intervals.intersection import IntervalTree, Interval

    bed_interval_tree = {}
    for line in BED_file:
        if line.startswith("#"):
            continue
        # split() without arguments also discards the line terminator
        fields = line.split()
        if not fields:
            # a blank line used to fail with an IndexError on fields[0]
            continue
        chrom, start_bed, end_bed = fields[0], int(fields[1]), int(fields[2])

        if chrom not in bed_interval_tree:
            bed_interval_tree[chrom] = IntervalTree()

        bed_interval_tree[chrom].add_interval(Interval(start_bed, end_bed))

    return bed_interval_tree
def main(argv):
    """reads in two haploid bam files so odd ploidy can be modelled

    Command-line options are read through get_commandline_options(). When
    somatic variants are to be mutated they are loaded from the SNV files
    first; otherwise an empty interval tree is passed on.
    """
    # get input params
    opts = get_commandline_options()
    out_bam = opts.output_bam_file
    hap1_bam = opts.bam_file_haplotype1
    hap2_bam = opts.bam_file_haplotype2
    chrom = opts.query_chr
    truth_path = opts.truth_file
    do_mutate = opts.mutate_somatic_variants
    snv_files = opts.somatic_snv_files
    clone_fractions = opts.clonal_percs
    subsample = opts.subsample_somatic_snvs
    ploidy_depth = opts.ploidy_depth
    bam_depth = opts.bam_depth

    cn_calls = read_in_truth_set(truth_path)
    # the SNV frequency files go next to the output bam
    snv_dir = os.path.dirname(out_bam) + "/forced_somatic_snv_frequencies"
    variants = IntervalTree()

    if opts.rand_seed:
        random.seed(opts.rand_seed)

    if do_mutate:
        # somatic_indel_file = '' if not defined and not used anyway at the mo
        variants = read_in_somatic_vcf_file(
            snv_files, clone_fractions, chrom, cn_calls, snv_dir, subsample)

    create_synthetic_bam(hap1_bam, hap2_bam, out_bam, chrom, do_mutate,
                         variants, cn_calls, ploidy_depth, bam_depth)
    print("finished first haplotype")
Example #19
0
def add_intervals(genes, exons):
    '''Insert each exon into the interval tree of its chromosome.

    genes maps chromosome -> interval tree and is updated in place; a tree
    is created the first time a chromosome (exon.value['chrom']) is seen.
    '''
    for current_exon in exons:
        chrom = current_exon.value['chrom']
        if chrom not in genes:
            genes[chrom] = IntervalTree()
        chrom_tree = genes[chrom]
        chrom_tree.insert_interval(current_exon)
Example #20
0
def intersect_scores(chr_features, interval2lnc, lnc_cons, chrom, block_start,
                     block_scores):
    """Append the scores of one block to every transcript it overlaps.

    For each interval of the chromosome's tree that overlaps the block, the
    part of block_scores inside the interval is appended to
    lnc_cons[tid] for every tid in
    interval2lnc[(chrom, interval.start, interval.end)].

    Args:
        chr_features: mapping chromosome -> interval tree.
        interval2lnc: mapping (chrom, start, end) -> iterable of ids.
        lnc_cons: mapping id -> list of scores, extended in place.
        chrom: chromosome of the block.
        block_start: coordinate of the first score in the block.
        block_scores: consecutive scores starting at block_start.
    """
    features = chr_features.get(chrom)
    if features is None:
        # No features on this chromosome. Returning here avoids building a
        # throwaway empty tree on every call just to find nothing in it.
        return

    n_scores = len(block_scores)
    block_end = block_start + n_scores - 1
    for overlap_interval in features.find(block_start, block_start + n_scores):
        # block internal to interval
        if overlap_interval.start <= block_start <= block_end <= overlap_interval.end:
            start = 0
            end = n_scores

        # interval internal to block
        elif block_start <= overlap_interval.start <= overlap_interval.end <= block_end:
            start = overlap_interval.start - block_start
            end = start + overlap_interval.end - overlap_interval.start + 1

        # interval begins inside the block and runs past its end
        elif block_start < overlap_interval.start:
            start = overlap_interval.start - block_start
            end = start + block_end - overlap_interval.start + 1

        # interval begins before the block and ends inside it
        else:
            start = 0
            end = overlap_interval.end - block_start + 1

        for tid in interval2lnc[(chrom, overlap_interval.start,
                                 overlap_interval.end)]:
            lnc_cons[tid] += block_scores[start:end]
Example #21
0
def load_repeats(repeatsF):
    """Load repeat annotations into one interval tree per reference id.

    The first line of repeatsF is treated as a header and skipped. Every
    other tab-separated line supplies reference id, start and end in
    columns 6-8, a strand in column 10 and two descriptive fields in
    columns 11 and 12 (1-based column numbers). "chr" is removed from the
    reference id. A (reference id, start, end) combination is indexed only
    once.

    Returns:
        dict mapping reference id -> interval tree whose intervals carry
        [tname, gtype, strand] as value.
    """
    rtrees = {}
    seen = set()
    # 'with' closes the file even if a malformed line raises
    with open(repeatsF) as repeatsIN:
        # skip header
        next(repeatsIN, None)
        for line in repeatsIN:
            fields = line.rstrip().split('\t')
            (refid, st, en) = fields[5:8]
            refid = refid.replace("chr", "")
            strand = fields[9]
            tname = fields[10]
            gtype = fields[11]
            # use 1-based closed interval
            st = int(st) + 1
            # Already present in the interval tree? The key includes the
            # reference id: trees are per reference, so equal coordinates
            # on another reference are not duplicates.
            key = (refid, st, en)
            if key in seen:
                continue
            seen.add(key)
            if refid not in rtrees:
                rtrees[refid] = IntervalTree()
            # must adjust for the open intervals (both) of the interval tree
            itv = Interval(st - 1, int(en) + 1, value=[tname, gtype, strand])
            rtrees[refid].insert_interval(itv)
    return rtrees
Example #22
0
    def setUp(self):
        """Fixture: four intervals per step, one through each insertion method."""
        tree = IntervalTree()
        lows = range(1, 1000, 80)
        for low in lows:
            square = low * low
            tree.insert(low, low + 10, {'value': square})
            # add is synonym for insert.
            tree.add(low + 20, low + 30, {'astr': str(square)})

            # or insert/add an interval object with start, end attrs.
            third = Interval(low + 40, low + 50, value={'astr': str(square)})
            tree.insert_interval(third)
            fourth = Interval(low + 60, low + 70, value={'astr': str(square)})
            tree.add_interval(fourth)

        self.intervals = self.iv = tree
        # four intervals were added per step
        self.nintervals = 4 * len(lows)
def make_intervals(hindiii_genome):
    '''
    Index the genome's hindiii fragments in one interval tree per chromosome.

    Fragment starts are shifted by -1 (conversion to 0-based for bx-python
    overlaps); each interval carries the fragment id as its value.
    Returns a dict mapping chromosome -> interval tree.
    '''
    trees_by_chrom = dict()
    for fragment in hindiii_genome.values():
        chrom = fragment.chrom
        # first fragment on a chromosome creates that chromosome's tree
        if chrom not in trees_by_chrom:
            trees_by_chrom[chrom] = IntervalTree()
        zero_based_start = int(fragment.start) - 1
        trees_by_chrom[chrom].add(zero_based_start, int(fragment.end),
                                  fragment.fragment_id)

    return trees_by_chrom
Example #24
0
def plot_coverage(coords, bams):
    '''Given the name of a DNA coordinates file and a list of bam file names,
    plot the read alignment coverage for each bam file for each coordinate.
    One graph per coordinate will be generated. The coverage for each
    BAM file for a given coordinate will be plotted on the same graph.
    The coordinates file should be in TSV format.'''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end + 1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # fetch takes a 0-based, half-open region, while our input
                # coordinates are 1-based and inclusive: [start, end]
                # becomes [start - 1, end). Passing end - 1 would miss
                # reads that only overlap the last base.
                for read in bam.fetch(chrom, start - 1, end):
                    positions = read.positions
                    if len(positions) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        interval_tree.add(positions[0] + 1, positions[-1] + 1,
                                          None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            # NOTE(review): whether find(pos, pos) counts reads that start
            # or end exactly at pos depends on the IntervalTree
            # implementation -- confirm.
            counts = [
                len(interval_tree.find(pos, pos)) for pos in coords_range
            ]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
Example #25
0
    def add(self, chrom, element):
        """Insert an element into the interval tree of *chrom*.

        The tree of a chromosome is created the first time that chromosome
        is seen; the element is then passed to the tree's
        ``insert_interval`` method.

        :param chrom: chromosome
        :param element: the argument of IntervalTree.insert_interval
        :return: None
        """
        trees = self._trees
        if chrom not in trees:
            # Build a tree only for a new chromosome;
            # setdefault(chrom, IntervalTree()) constructed (and usually
            # discarded) one on every call.
            trees[chrom] = IntervalTree()
        trees[chrom].insert_interval(element)
Example #26
0
def plot_coverage(coords, bams):
    '''Given the name of a DNA coordinates file and a list of bam file names,
    plot the read alignment coverage for each bam file for each coordinate.
    One graph per coordinate will be generated. The coverage for each
    BAM file for a given coordinate will be plotted on the same graph.
    The coordinates file should be in TSV format.'''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end+1)
        for bam_filename in bams:
            # interval tree tracks the start and end mapped coordinates
            # of each read in the bam file that lies within our region
            # of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # Collect all the reads from the BAM file which lie in
                # the region of interest. fetch expects a 0-based,
                # half-open region; the 1-based inclusive [start, end]
                # therefore maps to [start-1, end). Using end-1 as the
                # upper bound would drop reads overlapping only the last
                # base.
                reads = bam.fetch(chrom, start-1, end)
                # Insert the start and end of each aligned read into the
                # interval tree.
                for read in reads:
                    aligned = read.positions
                    if len(aligned) > 0:
                        # Add 1 to convert from 0-based to 1-based coordinates
                        first_pos = aligned[0] + 1
                        last_pos = aligned[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest,
            # count the number of reads which overlap this position.
            # This computes the coverage for each position in the region.
            # NOTE(review): whether find(pos, pos) includes reads that
            # start or end exactly at pos depends on the IntervalTree
            # implementation -- confirm.
            counts = [len(interval_tree.find(pos, pos))
                      for pos in coords_range]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
Example #27
0
def parse_blast(blast_str, qfeat):
  """takes tab-separated blast output and a query feature and returns the
  blast intervals that intersect the query feature.

  Lines containing "WARNING" or "ERROR" and blank lines are ignored. For
  every other line the values at column indices 6-9 must be integers; the
  first two of them delimit the interval that is indexed (in either
  order). qfeat supplies 'start' and 'end'.

  NOTE(review): in standard tabular BLAST output the columns at indices 6-7
  are the query coordinates and 8-9 the subject coordinates -- confirm
  which pair is intended here.
  """
  scns_inteval = IntervalTree()
  for line in blast_str.split("\n"):
    if "WARNING" in line: continue
    if "ERROR" in line: continue
    # a trailing newline in blast_str yields an empty last chunk
    if not line.strip(): continue
    columns = line.split("\t")
    # list comprehension instead of map(): map() is not a list on Python 3
    locus = [int(value) for value in columns[6:10]]

    bound_a, bound_b = locus[:2]
    # Take min and max from the untouched pair. Computing the maximum from
    # an already overwritten start collapsed reversed hits to a point.
    s_start = min(bound_a, bound_b)
    s_end = max(bound_a, bound_b)
    scns_inteval.insert_interval(Interval(s_start, s_end))

  q_start = min(int(qfeat['start']), int(qfeat['end']))
  q_end = max(int(qfeat['start']), int(qfeat['end']))
  intersecting_cns = scns_inteval.find(q_start, q_end)
  return intersecting_cns
Example #28
0
 def insert(self, chrom, start, end, gene_id, gene_name):
     """Store a gene as a MyInterval in the interval tree of its chromosome.

     The chromosome's tree is created in self.chroms on first use; the
     interval's payload is [gene_id, gene_name].
     """
     from bx.intervals.intersection import Interval, IntervalNode, IntervalTree
     if chrom not in self.chroms:
         self.chroms[chrom] = IntervalTree()
     self.chroms[chrom].insert(
         start, end, MyInterval(start, end, [gene_id, gene_name]))
def main():
    """Index the genes of the first command-line file per chromosome and
    look up overlaps for the second command-line file."""
    infile, qfile = sys.argv[1], sys.argv[2]

    trees_by_chrom = {}
    for chrom, gene in parse_gene_coordinate(infile):
        # the first gene on a chromosome creates that chromosome's tree
        if chrom not in trees_by_chrom:
            trees_by_chrom[chrom] = IntervalTree()
        trees_by_chrom[chrom].insert_interval(gene)

    find_overlap(trees_by_chrom, qfile)
Example #30
0
def index_gff3(gff3_file_path):
    """Index the features of a GFF3 file in one interval tree per sequence id.

    Follows the example at
    https://malariageninformatics.wordpress.com/2011/07/07/using-interval-trees-to-query-genome-annotations-by-position/

    Behavior is steered by the module-level ``args``:
    - args.tag: when set, only rows whose third column equals it are indexed.
    - args.attribute / args.join: when both are None the whole row is stored
      as the interval's value; otherwise the values of the matching
      attribute keys (column 9), joined with ",", are stored.

    Returns:
        dict mapping sequence id -> interval tree.
    """
    # dictionary mapping chromosome names to interval trees
    genome = dict()

    # parse the annotations file (GFF3) and build the interval trees
    gff = pd.read_csv(gff3_file_path, sep="\t", header=None, comment="#")
    for idx, row in gff.iterrows():
        if args.tag is not None and row[2] != args.tag:
            continue
        seqid = row[0]
        start = int(row[3])
        end = int(row[4])
        # one interval tree per chromosome, created on first use
        if seqid in genome:
            tree = genome[seqid]
        else:
            tree = IntervalTree()
            genome[seqid] = tree
        # index the feature
        if args.attribute is None and args.join is None:
            tree.add(start, end, row)
        else:
            selected = list()
            for pair in row[8].split(";"):
                # partition() tolerates what a two-way unpack of split("=")
                # crashed on: the empty chunk after a trailing ';' and
                # values that contain '=' themselves.
                key, separator, value = pair.partition("=")
                if not separator:
                    continue
                if key == args.attribute or key == args.join:
                    selected.append(value)
            tree.add(start, end, ",".join(selected))
    return genome
Example #31
0
    def setUp(self):
        """Fixture: per step, add four intervals using all four insertion methods."""
        tree = IntervalTree()
        step_starts = range(1, 1000, 80)
        for base in step_starts:
            squared = base * base
            tree.insert(base, base + 10, {'value': squared})
            # add is synonym for insert.
            tree.add(base + 20, base + 30, {'astr': str(squared)})

            # or insert/add an interval object with start, end attrs.
            tree.insert_interval(
                Interval(base + 40, base + 50, value={'astr': str(squared)}))
            tree.add_interval(
                Interval(base + 60, base + 70, value={'astr': str(squared)}))

        self.intervals = self.iv = tree
        # every step contributed four intervals
        self.nintervals = 4 * len(step_starts)
Example #32
0
def index_gtf(gtf_file_path):
    """Index the records of a GTF file in one interval tree per sequence id.

    Only tab-separated rows with exactly nine columns whose first column
    does not start with '##' are used. Columns 4 and 5 (1-based) give the
    interval; the whole row, as a tuple, is its value.

    Returns a dict mapping sequence id -> interval tree.
    """
    trees_by_seqid = {}
    with open(gtf_file_path, "r") as handle:
        for record in csv.reader(handle, delimiter='\t'):
            # incomplete rows and '##' directives are not features
            if len(record) != 9 or record[0].startswith('##'):
                continue
            seqid = record[0]
            low, high = int(record[3]), int(record[4])
            # first record of a sequence creates that sequence's tree
            if seqid not in trees_by_seqid:
                trees_by_seqid[seqid] = IntervalTree()
            trees_by_seqid[seqid].add(low, high, tuple(record))
    return trees_by_seqid
Example #33
0
 def setUp(self):
     """Fixture tree: 50-59 goes in first, followed by 0-9, 10-19, ... 100-109 without a second 50-59."""
     tree = IntervalTree()
     insertion_order = [50] + [low for low in range(0, 110, 10) if low != 50]
     for low in insertion_order:
         tree.add_interval(Interval(low, low + 9))
     self.intervals = tree
Example #34
0
class IntervalTreeOverlapDetector(OverlapDetector):
    """Overlap detector that keeps its segments in a bx-python IntervalTree."""

    def __init__(self, excludedSegments=None):
        """Create the tree and preload it with excludedSegments, if given.

        excludedSegments: optional iterable of (start, end) pairs.
        """
        from bx.intervals.intersection import IntervalTree
        self._intervalTree = IntervalTree()
        if excludedSegments:
            for start, end in excludedSegments:
                # Use the same retrying insert as addSegment, so preloaded
                # segments are protected from the sporadic bx-python error too.
                self._addElementHandleBxPythonZeroDivisionException(start, end)

    def overlaps(self, start, end):
        """Return True if the tree's find(start, end) reports any segment."""
        return bool(self._intervalTree.find(start, end))

    def addSegment(self, start, end):
        """Add the segment start..end to the tree, retrying on failure."""
        self._addElementHandleBxPythonZeroDivisionException(start, end)

    def _addElementHandleBxPythonZeroDivisionException(self,
                                                       start,
                                                       end,
                                                       nrTries=10):
        """
        DivisionByZero error is caused by a bug in the bx-python library.
        It happens rarely, so we just execute the add command again when it
        does. Every failed attempt is logged as a warning; once more than
        nrTries attempts have failed, we assume something else is wrong and
        re-raise the last exception.
        """
        cnt = 0
        while True:
            cnt += 1
            try:
                self._intervalTree.add(start, end)
            except Exception as e:
                from gold.application.LogSetup import logMessage, logging
                logMessage("Try nr %i. %s" % (cnt, str(e)), level=logging.WARN)
                if cnt > nrTries:
                    # bare raise keeps the original traceback
                    raise
            else:
                break
Example #35
0
def get_nearest_gene_intervall_tree(depict_gene_annotation_file, depictgenes):
    """
    Build one interval tree per chromosome "1".."22" from a gene annotation file.

    The first line of the file is treated as a header and skipped. Every
    following tab-separated line whose first column is in ``depictgenes`` and
    whose seventh column is one of "1".."22" contributes a single-point
    interval (tss, tss) carrying the first column as its value. ``tss`` is the
    third column when the second column equals '1', otherwise the fourth
    column.

    :param depict_gene_annotation_file: path of the tab-separated annotation file
    :param depictgenes: container of gene identifiers to keep (tested with ``in``)
    :return: dict mapping "1".."22" to an IntervalTree (empty when nothing matched)
    """
    ens_col = 0
    str_col = 1
    sta_col = 2
    end_col = 3
    chr_col = 6
    # One tree per chromosome name; the dict doubles as the set of accepted
    # chromosome names, so no per-line list has to be rebuilt.
    trees = {str(i): IntervalTree() for i in range(1, 23)}
    with open(depict_gene_annotation_file, 'r') as infile:
        # Skip the header line without loading the whole file into memory.
        next(infile, None)
        for line in infile:
            words = line.strip().split('\t')
            if words[ens_col] in depictgenes and words[chr_col] in trees:
                tss = int(words[sta_col]) if words[str_col] == '1' else int(words[end_col])
                trees[words[chr_col]].insert_interval(Interval(tss, tss, value=words[ens_col]))
    return trees
Example #36
0
def index_genes(G, window=0):
    """
    Index genes into one IntervalTree per sequence name.

    ``G`` is grouped by ``seqname``, sorted by ``start`` and flattened to
    (seqname, name, start, end) rows. Presumably ``G`` is an ibidas-style
    query object and ``_`` its column context -- TODO confirm.

    Each gene is stored under the interval (start - window, end + window)
    with the payload (name, start, end). One line per gene is printed to
    stdout as "seqname start end name".

    :param G: gene table exposing GroupBy/Sort/Get/Flat and callable to get columns
    :param window: amount by which each interval is widened on both sides
    :return: dict mapping seqname to its IntervalTree
    """
    G = G.GroupBy(_.seqname).Sort(_.start)
    G = G.Get(_.seqname, _.name, _.start, _.end).Flat()

    chrs = {}
    for (seqname, name, start, end) in zip(*G()):
        if seqname not in chrs:
            chrs[seqname] = IntervalTree()

        chrs[seqname].add(start - window, end + window, (name, start, end))
        # A single pre-formatted string prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s %s %s" % (seqname, start, end, name))

    return chrs
Example #37
0
def btab_reclist_to_interval_list_0basedStart(recs):
    """
    Load the records' spans into an IntervalTree and list its nodes.

    Each record is inserted as (rStart1 - 1, rEnd1), i.e. with the start
    shifted down by one. Returns (chr of the first record, list of the
    nodes handed to the traverse callback -- IntervalNode objects).
    """
    nodes = []
    tree = IntervalTree()
    for rec in recs:
        zero_based_start = rec["rStart1"] - 1
        tree.insert(zero_based_start, rec["rEnd1"])
    tree.traverse(nodes.append)
    return recs[0]["chr"], nodes
Example #38
0
def btab_reclist_to_interval_list_0basedStart(recs):
    """
    Return chr, list of IntervalNode

    The spans are inserted into an IntervalTree as (rStart1 - 1, rEnd1) and
    the nodes are collected by traversing that tree; chr is taken from the
    first record.
    """
    interval_tree = IntervalTree()
    for record in recs:
        span_start = record['rStart1'] - 1
        span_end = record['rEnd1']
        interval_tree.insert(span_start, span_end)
    collected = []
    interval_tree.traverse(collected.append)
    # Named seqname rather than chr so the builtin chr() is not shadowed.
    seqname = recs[0]['chr']
    return seqname, collected
Example #39
0
def read_genes(filename):
    """
    Read a tab-separated gene table into one IntervalTree per chromosome.

    The file must have a header row; the columns 'chromosome',
    'GRCh37 start', 'GRCh37 end', 'symbol' and 'tier' are used. Each usable
    row is inserted as the interval (start, end) with the payload
    (symbol, tier).

    Rows that cannot be parsed (missing column, missing value, non-integer
    start/end/tier) are skipped on purpose. Errors raised while inserting
    into the tree are no longer silenced and reach the caller.

    :param filename: path of the tab-separated gene file
    :return: dict mapping chromosome name to its IntervalTree
    """
    chroms = {}
    with open(filename) as handle:
        for row in csv.DictReader(handle, delimiter="\t"):
            try:
                chrom = row['chromosome']
                start = int(row['GRCh37 start'])
                end = int(row['GRCh37 end'])
                symbol = row['symbol']
                tier = int(row['tier'])
            except (KeyError, TypeError, ValueError):
                # KeyError: column absent from the header; TypeError: short
                # row (DictReader fills None); ValueError: not an integer.
                continue
            if chrom not in chroms:
                chroms[chrom] = IntervalTree()
            chroms[chrom].insert(start, end, (symbol, tier))
    return chroms
Example #40
0
def merge_gene_into_cluster(args):
    """
    Merge external genes into clusters of genes from clusterGenes

    Steps: parse ``args``; load the gl genes and group them into Cluster
    objects; index the clusters in one IntervalTree per (chrom, strand);
    assign the external genes to clusters, writing to ``f_out_no_overlap``;
    finally write every cluster's mapping to ``f_out``.
    """
    opts = parse_args(args)
    gl_path = opts.f_gl
    gl_gene_path = opts.f_gl_gene
    ext_gene_path = opts.f_ext_gene
    out_path = opts.f_out
    no_overlap_path = opts.f_out_no_overlap

    print("Loading gl gene ...")
    genes_by_id = load_gene(gl_gene_path)

    print("Loading gl ...")
    clusters_by_id = dict()
    for cluster_id, gene_id in load_gl(gl_path):
        if cluster_id not in clusters_by_id:
            clusters_by_id[cluster_id] = Cluster(cluster_id)
        assert gene_id in genes_by_id.keys(), "Cannot find {0} in {1}".format(
            gene_id, gl_gene_path)
        clusters_by_id[cluster_id].add_gene(genes_by_id[gene_id])

    clusters = list(clusters_by_id.values())

    # One IntervalTree per (chrom, strand) pair.
    trees_by_key = dict()
    for cluster in clusters:
        cluster.build_exon_block()
        key = (cluster.chrom, cluster.strand)
        tree = trees_by_key.get(key)
        if tree is None:
            tree = IntervalTree()
            trees_by_key[key] = tree
        tree.insert(cluster.start, cluster.end, cluster)

    print("Loading external gene ...")
    external_genes = list(load_gene(ext_gene_path).values())

    print("Assigning gene into clusters ...")
    with open(no_overlap_path, "w") as no_overlap_handle:
        assign_gene_to_cluster(external_genes, trees_by_key, no_overlap_handle)

    with open(out_path, "w") as out_handle:
        for cluster in clusters:
            cluster.write_mapping(out_handle)
def _bx(es):
    """
    Add every entry of ``es`` to a fresh IntervalTree and query it right away.

    Each entry is indexed as (entry[0], entry[1]) with the entry itself as
    payload. The hit count of each query is computed and discarded; nothing
    is returned.
    """
    tree = IntervalTree()
    for entry in es:
        tree.add(entry[0], entry[1], entry)
        hits = tree.find(entry[0], entry[1])
        hit_count = len(hits)
def index_annotation_file(annotation_file_path, annotation_type):
    """Parse an annotation file and build per-chromosome interval trees.

    The file is read as tab-delimited text. Only rows of a 'ref_gene'
    annotation are indexed; for any other ``annotation_type`` the rows are
    read but nothing is stored.

    Args:
        annotation_file_path: path of the tab-delimited annotation file.
        annotation_type: annotation flavour; only 'ref_gene' is handled.

    Returns:
        Tuple (genome, transcriptome, exome, transcripts_info,
        coding_region_info). The first three map a chromosome name to an
        IntervalTree holding gene ids, transcript ids and exon keys
        respectively. transcripts_info maps each transcript id to itself.
        coding_region_info maps a transcript id to a dict with the keys
        'fasta', 'cds_start', 'cds_stop' and 'strand' (set only when
        ``args.genome_reference`` is truthy) and 'mRNA'.

    Note:
        Reads the names ``args`` and ``fasta_path``, which are not defined
        inside this function and must exist in the enclosing scope.
    """

    # dictionary mapping chromosome names to interval trees, collecting geneID info
    genome = dict()
    # dictionary mapping chromosome names to interval trees, collecting transcriptID info
    transcriptome = dict()
    # dictionary mapping transcript info
    transcripts_info = dict()
    # dictionary mapping chromosome names to interval trees, collecting exon number info
    exome = dict()
    # dictionary mapping transcript id to coding region info
    coding_region_info = defaultdict(dict)

    with open(annotation_file_path,  'r') as annotation_file:

        reader = csv.reader(annotation_file, delimiter='\t')

        for line in reader:

            # Start with blank trees for each line
            tree_gene = None
            tree_transcript = None
            tree_exon = None

            if annotation_type == 'ref_gene':

                gene = ref_gene_parser(line)

                # one interval tree per chromosome (in each of the three dicts)
                if gene['chrom'] in genome:

                    tree_gene = genome[gene['chrom']]
                    tree_transcript = transcriptome[gene['chrom']]
                    tree_exon = exome[gene['chrom']]

                else:

                    # Chromosome not seen previously, create its interval trees
                    tree_gene = IntervalTree()
                    tree_transcript = IntervalTree()
                    tree_exon = IntervalTree()
                    genome[gene['chrom']] = tree_gene
                    transcriptome[gene['chrom']] = tree_transcript
                    exome[gene['chrom']] = tree_exon

                # index the feature: gene id and transcript id share the same span
                tree_gene.add(gene['start'], gene['stop'], gene['gene_id'])
                tree_transcript.add(gene['start'], gene['stop'], gene['transcript_id'])

                # A genome reference was supplied
                if args.genome_reference:

                    transcripts_info[ gene['transcript_id'] ] = gene['transcript_id']

                    # Collect fasta sequence and coding region
                    coding_region_info[ gene['transcript_id'] ]['fasta'] = read_fasta_file(fasta_path, gene['chrom'], int(gene['cds_start']), int(gene['cds_stop']))
                    coding_region_info[ gene['transcript_id'] ]['cds_start'] = gene['cds_start']
                    coding_region_info[ gene['transcript_id'] ]['cds_stop'] = gene['cds_stop']
                    coding_region_info[ gene['transcript_id'] ]['strand'] = gene['strand']

                # Sequence slices collected per exon; joined into 'mRNA' below
                mrna_fasta = []
                position_mrna = 0
                # gene['exon_start'] / gene['exon_stop'] are indexed by the same exon key
                for exon in gene['exon_start']:

                    tree_exon.add(int(gene['exon_start'][exon]), int(gene['exon_stop'][exon]), exon)

                    #print(gene['transcript_id'], exon)
                    # NOTE(review): 'fasta' is only assigned when args.genome_reference
                    # is truthy; without it this lookup looks like it raises KeyError
                    # on the first exon -- confirm.
                    if coding_region_info[ gene['transcript_id'] ]['fasta']:

                        # Slice bounds into the 'fasta' string (relative to cds_start)
                        start_fasta = 0
                        stop_fasta = 0

                        # Only '+' strand transcripts contribute to mrna_fasta
                        if "+" in gene['strand']:

                            # Exon lies strictly inside the coding region
                            if (int(gene['exon_start'][exon]) > gene['cds_start']) and (int(gene['exon_stop'][exon]) < gene['cds_stop']):

                                start_fasta = int(gene['exon_start'][exon]) - gene['cds_start']
                                stop_fasta = int(gene['exon_stop'][exon]) - gene['cds_start']
                                position_exon = range(int(gene['exon_start'][exon]), int(gene['exon_stop'][exon]))

                                position_mrna = map_genomic_position_to_mrna_position(coding_region_info, gene['strand'], gene['transcript_id'], position_mrna, position_exon, int(gene['exon_start'][exon]))

                            # Exon ends before cds_start: empty slice
                            elif (int(gene['exon_stop'][exon]) < gene['cds_start']):

                                start_fasta = 0
                                stop_fasta = 0

                            # Exon starts after cds_stop: empty slice
                            elif (int(gene['exon_start'][exon]) > gene['cds_stop']):

                                start_fasta = 0
                                stop_fasta = 0

                            # Exon starts before cds_start
                            elif int(gene['exon_start'][exon]) < gene['cds_start']:

                                start_fasta = 0

                                # Exon encompasses whole cds
                                # NOTE(review): position_exon is computed here but
                                # map_genomic_position_to_mrna_position is not called,
                                # unlike in the other overlapping cases -- confirm intent.
                                if ( (int(gene['exon_stop'][exon]) > gene['cds_start']) and (int(gene['exon_stop'][exon]) > gene['cds_stop']) ):

                                    stop_fasta = gene['cds_stop'] - gene['cds_start']
                                    position_exon = range(gene['cds_start'], gene['cds_stop'])

                                # Exon ends after cds_start but, given the test above, not past cds_stop
                                elif int(gene['exon_stop'][exon]) > gene['cds_start']:

                                    stop_fasta = int(gene['exon_stop'][exon]) - gene['cds_start']
                                    position_exon = range(gene['cds_start'], int(gene['exon_stop'][exon]))

                                    position_mrna = map_genomic_position_to_mrna_position(coding_region_info, gene['strand'], gene['transcript_id'], position_mrna, position_exon, int(gene['exon_start'][exon]))

                                # NOTE(review): reached only when exon_stop <= cds_start and
                                # exon_stop > cds_stop, i.e. cds_stop < cds_start. An exon that
                                # starts inside the cds and ends past cds_stop matches none of
                                # the outer branches -- presumably this case was meant to be
                                # one level up; confirm.
                                elif int(gene['exon_stop'][exon]) > gene['cds_stop']:

                                    stop_fasta = gene['cds_stop'] - gene['cds_start']

                                    if int(gene['exon_start'][exon]) < gene['cds_stop']:

                                        start_fasta = int(gene['exon_start'][exon]) - gene['cds_start']
                                        position_exon = range(int(gene['exon_start'][exon]), gene['cds_stop'])

                                        position_mrna = map_genomic_position_to_mrna_position(coding_region_info, gene['strand'], gene['transcript_id'], position_mrna, position_exon, int(gene['exon_start'][exon]))

                            # Append this exon's slice of the coding sequence (may be empty)
                            mrna_fasta.append(coding_region_info[ gene['transcript_id'] ]['fasta'][start_fasta:stop_fasta])

                    # Re-joined after every exon, so the last iteration holds the full result
                    coding_region_info[gene['transcript_id'] ]['mRNA'] = ''.join(mrna_fasta)

    return genome, transcriptome, exome, transcripts_info, coding_region_info
Example #43
0
def resolve_conflicts(pfam_hit_dict,minDomSize = 9,verbose=False):
    '''
    Resolve overlapping hits for one gene, keeping the higher scoring hit
    wherever two hits overlap, then merge neighbouring hits with the same id
    and drop hits that are too short.

    Note that pfam_hit_dict is modified in place: a hit that strictly spans an
    already placed hit is deleted and replaced by its two flanking pieces.

    :param pfam_hit_dict: dictionary of hits for the gene in the following format
    hit start,hit end : int
    hit id : str
    score, model coverage percent : float
    {(hit start,hit end):('hit id',score,model coverage percent)}
    :param minDomSize: int, the minimum window size that will be considered a domain
    :param verbose: bool, print progress messages when True
    :return:
    a sorted dictionary with the position of the hit as the keys and ('hit id',score,model coverage percent)
    '''
    # initialize output
    gene_hits = SortedDict()
    # redoFlag: set when a hit was split, so the placement pass must start over
    # NOTE(review): gene_hits is not cleared when the pass restarts, only the
    # intersect tree is rebuilt -- confirm that stale entries cannot survive.
    redoFlag = True
    while redoFlag:
        if verbose: print("Sorting through intervals", pfam_hit_dict)
        redoFlag = False
        intervals_scores = [(key,value[1]) for key,value in pfam_hit_dict.items()]
        # sort intervals from pfam hits by score and place the highest score first
        intervals_scores.sort(key=itemgetter(1),reverse=True)
        # initialize intersect tree for quick overlap search
        intersectTree = IntervalTree()
        #add the intervals with the highest scores first
        for (interval,score) in intervals_scores:
            intervalStart = interval[0]
            intervalEnd = interval[1]
            intervalLength = intervalEnd-intervalStart+1
            # only intervals longer than the minimum domain size are considered
            if intervalLength > minDomSize:
                intersectingIntervals = [(x.start,x.end) for x in intersectTree.find(intervalStart,intervalEnd)]
                overLapFlag = False
                # for every interval that you're adding resolve the overlapping intervals
                while len(intersectingIntervals) > 0 and intervalLength > 1:

                    # only the first reported overlap is handled per iteration
                    start,end = intersectingIntervals[0]

                    # interval completely covers existing coverage, break up into two intervals and redo the process
                    if (intervalStart < start and intervalEnd > end):
                        if verbose: print("Split Interval", interval,intersectingIntervals, pfam_hit_dict[interval])
                        # coverage of each piece is scaled by its share of the current length
                        left_scale = calculate_window((intervalStart,start-1))/intervalLength
                        right_scale = calculate_window((end+1,intervalEnd))/intervalLength
                        pfam_hit_dict[(intervalStart,start-1)] = (pfam_hit_dict[interval][0],
                                                                  pfam_hit_dict[interval][1],
                                                                  pfam_hit_dict[interval][2] * left_scale)
                        pfam_hit_dict[(end+1,intervalEnd)] = (pfam_hit_dict[interval][0],
                                                              pfam_hit_dict[interval][1],
                                                              pfam_hit_dict[interval][2] * right_scale)
                        # delete original hit and iterate
                        del pfam_hit_dict[interval]
                        redoFlag = True
                        break
                    else:
                        #completely in the interval
                        if (intervalStart >= start and intervalEnd <= end):
                            #if completely overlapping then ignore since we already sorted by score
                            overLapFlag = True
                            break
                        #intersection covers the left hand side of the interval
                        elif intervalStart >= start:
                            intervalStart = end + 1
                        #intersection covers the right hand side of the interval
                        elif intervalEnd <= end:
                            intervalEnd = start - 1
                        # recalculate the interval length and see if there are still intersecting intervals
                        intervalLength = intervalEnd-intervalStart+1
                        intersectingIntervals = [(x.start,x.end) for x in intersectTree.find(intervalStart,intervalEnd)]

                if redoFlag:
                    if verbose: print("Exiting For Loop to Reinitialize",pfam_hit_dict)
                    break
                # if loop did not break because of an overlap add the annotation after resolving overlap,
                # check for minimum length after trimming the interval
                elif not overLapFlag and intervalLength > minDomSize:
                    if verbose: print("Adding Hit",(intervalStart,intervalEnd),pfam_hit_dict[interval][0])
                    # scale the hitCoverage based on the reduction this works since interval is a tuple and isn't mutated
                    hitCoverage = pfam_hit_dict[interval][2]*(intervalLength/(interval[1]-interval[0]+1.))
                    gene_hits[(intervalStart,intervalEnd)] = (pfam_hit_dict[interval][0],
                                                              pfam_hit_dict[interval][1],
                                                              hitCoverage)
                    intersectTree.add_interval(Interval(float(intervalStart),intervalEnd))
    if verbose: print("Merging Hits")
    # Merge Windows Right Next to one another that have the same pFam ID,
    # redoFlag: need to restart the process after a successful merge
    redoFlag = True
    while redoFlag:
        # compare each hit with its right-hand neighbour in sorted key order
        for idx in range(len(gene_hits)-1):
            left_hit = gene_hits.keys()[idx]
            right_hit = gene_hits.keys()[idx+1]
            left_window_size = calculate_window(left_hit)
            right_window_size = calculate_window(right_hit)
            merged_window_size = calculate_window((left_hit[0],right_hit[1]))
            new_coverage = (gene_hits[left_hit][2] + gene_hits[right_hit][2])*\
                           (left_window_size+ right_window_size)/merged_window_size
            # Will merge a hit under the following conditions:
            # 1. Gap between the two hits is less than the minimum domain
            # 2. Both hits carry the same hit id
            # 3. Cumulative coverage of the two hits is less than 1 (this avoids merging repeats together)
            if right_hit[0]-left_hit[1] < minDomSize and gene_hits[left_hit][0] == gene_hits[right_hit][0] \
                    and new_coverage < 1:
                # the merged score is the window-size weighted mix of both scores
                gene_hits[(left_hit[0],right_hit[1])] = (gene_hits[left_hit][0],
                                                         left_window_size/merged_window_size * gene_hits[left_hit][1] +
                                                         right_window_size/merged_window_size * gene_hits[right_hit][1],
                                                         new_coverage)
                redoFlag = True
                del gene_hits[left_hit]
                del gene_hits[right_hit]
                if verbose: print("Merged", left_hit,right_hit)
                break
        else:
            # the for loop finished without a merge: nothing left to merge
            redoFlag = False
    if verbose: print("Deleting Domains Under Minimum Domain Size")
    # Finally check if any of the domains are less than the minimum domain size
    keysToDelete = [coordinates for coordinates in gene_hits.keys() if calculate_window(coordinates) < minDomSize]
    for key in keysToDelete:
        del gene_hits[key]
        if verbose: print("Deleting",key)
    if verbose: print("Final Annotation", gene_hits)
    return gene_hits
def add_new_locus(match, overlappers, chromo):
    """
    Register a one-interval IntervalTree for the locus named in ``match``.

    The locus name is the second '|'-separated field of ``match.name2``; the
    tree holds (match.zstart1, match.end1) with that name as payload and is
    stored at overlappers[chromo][locus]. ``overlappers`` is modified in
    place and also returned.
    """
    locus_tree = IntervalTree()
    locus_name = match.name2.split('|')[1]
    locus_tree.insert(match.zstart1, match.end1, locus_name)
    loci_on_chromosome = overlappers[chromo]
    loci_on_chromosome[locus_name] = locus_tree
    return overlappers