def __init__(self, chromosome, start, end=None, name=None, build=genomeUtils.hg19, strand=None, attributes=None):
     '''
     Create a genome feature that spans [start,stop) of chromosome in build (using 0-based BED coordinates)

     chromosome -- chromosome label; normalized with genomeUtils.standardizeChromosome
     start      -- start position; normalized with genomeUtils.standardizePosition
     end        -- end position; when omitted the feature is assumed to be 1 bp wide
     name       -- feature name; defaults to the "<chromosome>_<start>_<end>" hash code
     build      -- genome build passed to the genomeUtils helpers
     strand     -- strand, stored as given and forwarded to Interval
     attributes -- dict stored on the feature and used as the Interval value;
                   a fresh dict is created per instance when omitted

     Raises Exception if the chromosome cannot be standardized.
     '''
     normalized = genomeUtils.standardizeChromosome(chromosome,build)
     if normalized is None:
         raise Exception('Invalid Chromosome: %s' % chromosome)
     chromosome = normalized

     start,self.genomeStart = genomeUtils.standardizePosition(chromosome, start, build)
     if end is None:    # assume that the size is only 1 bp
         end = start+1
         self.genomeEnd = self.genomeStart+1
     else:
         end,self.genomeEnd = genomeUtils.standardizePosition(chromosome, end, build)

     self.hashCode = "%s_%i_%i" % (chromosome,start,end)
     self.name = self.hashCode if name is None else name

     self.build = build
     self.strand = strand
     # A literal {} default would be one dict shared by every instance that
     # omits the argument, so build a new one here instead.
     self.attributes = {} if attributes is None else attributes

     Interval.__init__(self, start=start, end=end, value=self.attributes, chrom=chromosome, strand=strand)

     # I use these objects for nested structures such as exons inside a gene, etc
     self.queryObjects = set()
     self.children = set()
def get_reads_and_ranges(bam_region, cid, chrom, region_start, region_end, strand, options):
    """Collect the accepted reads of a region and update per-position counters.

    Reads come from filter_reads() on the prefixed chromosome name. Each read
    accepted as a valid pair or a valid single read is stored as a 1-based
    interval, reported to inc_pr() over its span, and reported to inc_pr_at()
    at one or both of its ends. `cid` and `strand` are not used here.

    Returns (filtered_reads, pos_range): the Intersecter of accepted read
    intervals and the defaultdict (default [0, 0]) filled by inc_pr/inc_pr_at.
    """
    pos_range = defaultdict(lambda: [0,0])
    filtered_reads = Intersecter()
    contig = options.chrom_prefix + chrom

    for read in filter_reads(bam_region, contig, region_start, region_end, options):
        paired = is_valid_paired(read, region_start, options)
        if paired:
            # The span of a pair runs from the leftmost mate over the insert size.
            rstart = min(read.pos, read.pnext) + 1
            rend = rstart + abs(read.isize) - 1
        elif is_valid_single(read, options):
            rstart = read.pos + 1
            rend = rstart + aln_length(read.cigar) - 1
        else:
            continue

        filtered_reads.add_interval(Interval(rstart, rend))
        inc_pr(pos_range, rstart, rend, region_start, region_end)

        # Pairs and merged/trimmed singles mark both ends; other single reads
        # mark only one end, chosen by orientation.
        if paired or as_merged(read, options) or as_trimmed(read, options):
            marks = (rstart, rend)
        elif read.is_reverse:
            marks = (rend,)
        else:
            marks = (rstart,)
        for position in marks:
            inc_pr_at(pos_range, position, region_start, region_end)

    return filtered_reads, pos_range
Example #3
0
 def setUp(self):
     """Build an IntervalNode tree holding (0, 9), (10, 19), ..., (100, 109)."""
     # The (50, 59) interval seeds the tree, so the loop must not insert it again.
     root = IntervalNode(50, 59, Interval(50, 59))
     for low in range(0, 110, 10):
         if low != 50:
             span = Interval(low, low + 9)
             root = root.insert(span.start, span.end, span)
     self.intervals = root
Example #4
0
 def setUp(self):
     """Fill an IntervalTree with (0, 9), (10, 19), ..., (100, 109)."""
     tree = IntervalTree()
     # (50, 59) goes in first; the loop adds the remaining intervals.
     tree.add_interval(Interval(50, 59))
     for low in range(0, 110, 10):
         if low != 50:
             tree.add_interval(Interval(low, low + 9))
     self.intervals = tree
Example #5
0
def regionTree2(tmp,resFrag,strand,info,geneid):
    """Add the region tmp = (chrom, start, end, ...) to resFrag[strand][chrom].

    resFrag is a two-level dict (strand, then chromosome) of Intersecter
    trees; missing levels are created on demand. The stored Interval carries
    {"exon": info, "geneid": geneid} as its value.
    """
    if strand not in resFrag:
        resFrag[strand] = {}
    by_chrom = resFrag[strand]
    chrom = tmp[0]
    if chrom not in by_chrom:
        by_chrom[chrom] = Intersecter()
    region = Interval(int(tmp[1]), int(tmp[2]), value={"exon": info, 'geneid': geneid})
    by_chrom[chrom].add_interval(region)
Example #6
0
    def test_left(self):
        """left() returns the nearest intervals ending before a position."""
        tree = self.intervals
        expected = [Interval(50, 59), Interval(40, 49)]
        self.assertEqual(str(tree.left(60, n=2)), str(expected))

        # The closest interval to the left of each multiple of ten ends one below it.
        for position in range(10, 100, 10):
            nearest = tree.left(position, max_dist=10, n=1)
            self.assertEqual(nearest[0].end, position - 1)
Example #7
0
    def setUp(self):
        """Build a large IntervalNode tree: one point every 10 up to self.max, plus 600 copies of (0, 1)."""
        root = IntervalNode(1, 2, Interval(1, 2))
        self.max = 1000000
        for position in range(0, self.max, 10):
            point = Interval(position, position)
            root = root.insert(point.start, point.end, point)

        # Pile identical intervals onto the same coordinates.
        for _ in range(600):
            root = root.insert(0, 1, Interval(0, 1))
        self.intervals = root
Example #8
0
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Adapted from the function of the same name in HiC-Pro
    (https://github.com/nservant/HiC-Pro/blob/master/scripts/mapped_2hic_fragments.py)
    by Nicolas Servant, Eric Viara.

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each stored interval carries
    {'name': <fragment name>, 'midPoint': (start + end)/2} as its value.

    in_file = input file [character]
    minfragsize = fragments shorter than this are skipped with a warning
    maxfragsize = fragments longer than this are skipped with a warning
    verbose = verbose mode [logical]

    Lines with fewer than four tab-separated fields are reported and skipped.
    """
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals '" + in_file + "'...")

    # The context manager closes the file even if a line fails to parse.
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # nline is an int: it must be converted before concatenation.
                print("Warning : wrong input format in line " + str(nline) + ". Not a BED file !?")
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            midPoint = (start + end)/2
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_short = minfragsize is not None and fragl < int(minfragsize)
            too_long = maxfragsize is not None and fragl > int(maxfragsize)
            if too_short or too_long:
                print("Warning : fragment " + name + " [" + str(fragl) + "] outside of range. Discarded")
                continue

            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name, 'midPoint': midPoint}))

    return resFrag
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome

    in_file = input file [character]
    minfragsize = fragments shorter than this are flagged [integer]
    maxfragsize = fragments longer than this are flagged [integer]
    verbose = verbose mode [logical]

    Every well-formed line is stored; each interval carries
    {'name': <fragment name>, 'filter': <True when outside the size range>}.
    Out-of-range fragments are therefore flagged rather than removed, although
    the summary warning calls them discarded. Lines with fewer than four
    tab-separated fields are reported and skipped.
    """
    resFrag = {}
    # Single-argument print(...) calls behave the same on Python 2 and 3; the
    # format strings reproduce the spacing of the former print statements.
    if verbose:
        print("## Loading Restriction File Intervals ' %s '..." % (in_file,))

    nline = 0
    nfilt = 0
    # The context manager closes the file even if a line fails to parse.
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % (nline,))
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Flag fragments outside the size range
            filt = False
            if minfragsize is not None and fragl < int(minfragsize):
                filt = True
            elif maxfragsize is not None and fragl > int(maxfragsize):
                filt = True
            if filt:
                nfilt += 1

            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name, 'filter': filt}))

    if nfilt > 0:
        print("Warning :  %s fragment(s) outside of range and discarded.  %s  remaining."
              % (nfilt, nline - nfilt))

    return resFrag
Example #10
0
    def test_downstream(self):
        """downstream_of_interval() respects the strand of the query interval."""
        tree = self.intervals

        # Default strand: downstream hits start after the query.
        forward_hits = tree.downstream_of_interval(
            Interval(59, 60), num_intervals=200)
        for hit in forward_hits:
            self.assertTrue(hit.start > 60)

        # Reverse strand: downstream hits start before the query.
        reverse_hits = tree.downstream_of_interval(
            Interval(59, 60, strand=-1), num_intervals=200)
        for hit in reverse_hits:
            self.assertTrue(hit.start < 59)
Example #11
0
def add_pvalues_to_peaks_frame_macs_bf(peaks_frame,experiment_peaks_frame,TTAA_frame,lam_win_size,pseudocounts = 0.2,macs_pvalue=True):
	"""Add a Poisson p-value to every row of peaks_frame and return the frame.

	For each peak, a rate (lambda) is estimated as the number of experiment
	hops divided by the number of TTAA sites (at least 1) found in a window of
	about lam_win_size centred on the peak's "Center". The p-value is
	1 - poisson.cdf(hops + pseudocounts, lambda * max(TTAAs in peak, 1) + pseudocounts).

	peaks_frame            -- frame with "Chr", "Start", "End", "Center" and
	                          "Experiment Hops" columns; the columns
	                          "Lambda Type", "Lambda" and "Poisson pvalue"
	                          are assigned on it in place
	experiment_peaks_frame -- frame with "Chr" and "Start" columns (hops)
	TTAA_frame             -- frame with "Chr" and "Start" columns (TTAA sites)
	lam_win_size           -- window size used for the lambda estimate
	pseudocounts           -- added to both the observed count and the expected count
	macs_pvalue            -- not used by this function

	NOTE(review): the print statements make this Python 2 code, so
	lam_win_size/2 presumably relies on integer division for int input -- confirm.
	NOTE(review): `scistat` is presumably scipy.stats -- confirm against the imports.
	"""
	print "lab specific hohoho"
	experiment_gnashy_dict = {}
	experiment_dict_of_trees = {}
	TTAA_frame_gbChr_dict = {} 
	TTAA_dict_of_trees = {}
	# NOTE(review): list_of_l_names is never read again in this function.
	list_of_l_names = [lam_win_size]
	print "Making interval tree for experiment hops..."
	for name,group in experiment_peaks_frame.groupby('Chr'):
		experiment_gnashy_dict[name] = group
		# re-index the group by its "Start" column so iterrows() yields positions
		experiment_gnashy_dict[name].index = experiment_gnashy_dict[name]["Start"]
		#initialize tree
		experiment_dict_of_trees[name] = Intersecter()
		#populate tree with position as interval (each hop spans start..start+3)
		for idx, row in experiment_gnashy_dict[name].iterrows():
			experiment_dict_of_trees[name].add_interval(Interval(int(idx),int(idx)+3)) 
	print "Making interval tree for TTAAs..."
	#make interval tree for TTAAs
	for name,group in TTAA_frame.groupby('Chr'): 
		TTAA_frame_gbChr_dict[name] = group
		# re-index the group by its "Start" column so iterrows() yields positions
		TTAA_frame_gbChr_dict[name].index = TTAA_frame_gbChr_dict[name]["Start"]
		#initialize tree
		TTAA_dict_of_trees[name] = Intersecter() 
		#populate tree with position as interval (each site spans start..start+3)
		for idx, row in TTAA_frame_gbChr_dict[name].iterrows(): 
			TTAA_dict_of_trees[name].add_interval(Interval(int(idx),int(idx+3)))
	#go through cluster frame and compute pvalues 
	lambda_type_list =[]
	lambda_list = []
	pvalue_list = []
	for idx,row in peaks_frame.iterrows():
		#the lambda window is centred on the peak's "Center" column
		cluster_center = row["Center"]
		#count the TTAA sites inside the peak itself
		# (a chromosome missing from the TTAA trees raises KeyError here)
		num_TTAAs = len(TTAA_dict_of_trees[row["Chr"]].find(row["Start"],row["End"]))
		#compute lambda for window size: hops per TTAA site inside the window
		num_exp_hops_lam_win_size = len(experiment_dict_of_trees[row["Chr"]].find(cluster_center-(lam_win_size/2 - 1),cluster_center+lam_win_size/2))
		num_TTAAs_lam_win_size = len(TTAA_dict_of_trees[row["Chr"]].find(cluster_center-(lam_win_size/2 - 1),cluster_center+lam_win_size/2))
		#max(..., 1) avoids division by zero when the window holds no TTAA site
		lambda_win_size = float(num_exp_hops_lam_win_size)/(max(num_TTAAs_lam_win_size,1))

		lambda_f = lambda_win_size
		lambda_type_list.append(lam_win_size)
		lambda_list.append(lambda_f)
		#compute pvalue and record it
		pvalue = 1-scistat.poisson.cdf((row["Experiment Hops"]+pseudocounts),lambda_f*max(num_TTAAs,1)+pseudocounts)
		pvalue_list.append(pvalue)

						 
	#attach the per-peak lists to peaks_frame as new columns 
	peaks_frame["Lambda Type"] = lambda_type_list
	peaks_frame["Lambda"] = lambda_list
	peaks_frame["Poisson pvalue"] = pvalue_list
	return peaks_frame
Example #12
0
def load_exons_and_genes(genesF):
    """Load exon and gene-extent interval trees from a tab-separated file.

    Each data line supplies cluster_id, transcript name, reference id, strand,
    transcript start and end in its first six fields, comma-separated exon
    starts/ends in fields 9 and 10, and an alignment id in field 12. Lines
    whose first character is 'c' are treated as header lines and skipped.

    Returns (etrees, gtrees), both dicts keyed by reference id with any "chr"
    removed:
      etrees -- IntervalTree of exons, value [cluster_id, alignID, strand]
      gtrees -- IntervalTree of per-cluster extents (smallest start to
                largest end over the cluster's transcripts), value cluster_id
    """
    gtrees = {}
    etrees = {}
    #map cluster_id ("gene") to [start, end, refid] of its widest extent
    t_to_gene_map = {}

    # The context manager closes the file even if a line fails to parse.
    with open(genesF) as genesIN:
        #load individual transcripts (isoforms)
        for line in genesIN:
            #skip header
            if line[0] == 'c':
                continue
            fields = line.rstrip().split('\t')
            (cluster_id, tname, refid, strand, tstart, tend) = fields[:6]
            refid = refid.replace("chr", "")
            eStarts = fields[9].split(',')
            eEnds = fields[10].split(',')
            alignID = fields[12]
            #now save the exons as intervals
            if refid not in etrees:
                etrees[refid] = IntervalTree()
            #use 1-based closed interval
            tstart = int(tstart) + 1
            for (eStart, eEnd) in zip(eStarts, eEnds):
                # a trailing comma leaves an empty last item
                if len(eStart) == 0:
                    continue
                #use 1-based closed interval
                eStart = int(eStart) + 1
                #must adjust for the open intervals (both) of the interval tree
                itv = Interval(eStart - 1, int(eEnd) + 1, value=[cluster_id, alignID, strand])
                etrees[refid].insert_interval(itv)
            #now map to the cluster_id and figure whether we can increase
            #the longest transcript coordinate span with these coordinates
            tend = int(tend)
            if cluster_id not in t_to_gene_map:
                t_to_gene_map[cluster_id] = [tstart, tend, refid]
            if tstart < t_to_gene_map[cluster_id][0]:
                t_to_gene_map[cluster_id][0] = tstart
            if tend > t_to_gene_map[cluster_id][1]:
                t_to_gene_map[cluster_id][1] = tend

    #now convert the cluster (gene) coordinate extents to intervals
    # .items() exists on both Python 2 and 3; .iteritems() is Python 2 only.
    for (cluster_id, span) in t_to_gene_map.items():
        (st, en, refid) = span
        if refid not in gtrees:
            gtrees[refid] = IntervalTree()
        #must adjust for the open intervals (both) of the interval tree
        itv = Interval(int(st) - 1, int(en) + 1, value=cluster_id)
        gtrees[refid].insert_interval(itv)
    return (etrees, gtrees)
Example #13
0
 def add_exon(self, rStart0, rEnd1, sStart0, sEnd1, rstrand, score):
     """Record one exon: its reference span, its sequence span and its score.

     Both spans must be non-empty. On the '-' strand the exon is prepended and
     must lie at or before the current first reference exon; otherwise it is
     appended and must lie at or after the current last one.
     """
     assert rStart0 < rEnd1 and sStart0 < sEnd1
     if rstrand == '-':
         assert not self.ref_exons or self.ref_exons[0].start >= rEnd1
         self.scores.insert(0, score)
         self.ref_exons.insert(0, Interval(rStart0, rEnd1))
         self.seq_exons.insert(0, Interval(sStart0, sEnd1))
         return
     assert not self.ref_exons or self.ref_exons[-1].end <= rStart0
     self.scores.append(score)
     self.ref_exons.append(Interval(rStart0, rEnd1))
     self.seq_exons.append(Interval(sStart0, sEnd1))
Example #14
0
def BED_to_interval_tree(BED_file):
    """
    Index the entries of a BED file in one interval tree per chromosome

    Lines whose first character is "#" are skipped. Overlapping regions are
    all kept.

    :param BED_file: file handler of a BED file

    :return dict mapping chromosome name to its IntervalTree
    """
    from bx.intervals.intersection import IntervalTree, Interval

    trees_by_chrom = {}
    for raw_line in BED_file:
        if raw_line[0] == "#":
            continue
        columns = raw_line.strip().split()
        chrom = columns[0]
        begin = int(columns[1])
        finish = int(columns[2])

        try:
            tree = trees_by_chrom[chrom]
        except KeyError:
            tree = trees_by_chrom[chrom] = IntervalTree()

        tree.add_interval(Interval(begin, finish))

    return trees_by_chrom
Example #15
0
def convert_BLAST9rec_to_gmapRecord(rec_list):
    """
    Build a gmapRecord from a list of BLAST9 records so it can be written in
    UCSC format.

    The first record supplies the reference name (`sID`), the sequence id
    (`qID`) and the strand; every record must agree on all three. The
    resulting record's `ref_exons` holds one Interval (sStart, sEnd) per
    input record, in input order.

    :param rec_list: non-empty list of records with sID, qID, strand, sStart
                     and sEnd attributes
    :return: gmapRecord created with coverage=0 and identity=0
    :raises RuntimeError: if the list is empty or the records disagree on
                          sID, qID or strand
    """
    if len(rec_list) == 0:
        raise RuntimeError("Cannot convert an empty record list!")
    seqname = rec_list[0].sID
    seqid = rec_list[0].qID
    strand = rec_list[0].strand

    # Compare against the first record's sID. The earlier code compared with
    # the builtin `chr`, which never matches, so every input was rejected.
    if not all(x.sID == seqname for x in rec_list):
        raise RuntimeError(
            "The record list has differing `sID` valuess - they must all be the same!!"
        )
    if not all(x.qID == seqid for x in rec_list):
        raise RuntimeError(
            "The record list has differing `qID` valuess - they must all be the same!"
        )
    if not all(x.strand == strand for x in rec_list):
        raise RuntimeError(
            "The record list has differing `strand` values - they must all be the same!!"
        )

    r = gmapRecord(seqname, coverage=0, identity=0, strand=strand, seqid=seqid)
    r.ref_exons = [Interval(x.sStart, x.sEnd) for x in rec_list]

    return r
Example #16
0
def load_bed(read_length, path, flanking):
    """
    Parse a BED file, keeping only targets that do not overlap an earlier
    kept target on the same chromosome and that span at least twice the
    read length after flanking.
    :param read_length: minimum target size is read_length * 2
    :param path: BED file path
    :param flanking: Integer to add to each target's start and end
    :return: list of [chrom, start, end] for the kept targets, in file order
    """
    trees = {}
    kept = []
    for record in BedTool(path):
        chrom, start, end = record[0], int(record[1]), int(record[2])
        start = start - flanking
        end = end + flanking
        # a target on the minus strand has its coordinates reversed; flip them
        if int(end) < int(start):
            start, end = end, start
        if chrom not in trees:
            trees[chrom] = Intersecter()
        tree = trees[chrom]
        no_overlap_yet = tree.find(start, end) == []
        if no_overlap_yet and abs(start - end) >= (read_length * 2):
            tree.add_interval(Interval(start, end))
            kept.append([chrom, start, end])

    return kept
Example #17
0
def parse_models(bedfile):
    '''Converts gene models in BED format to a list of exon intervals.

    Yields one list per BED row. Each exon is an Interval whose value is a
    dict with the row's 'geneid' (column 4), its 'chrom', and 'terminal'
    (True for the first and the last exon of the model). Exon sizes are read
    from column 11 and exon starts, relative to chromStart, from column 12.
    Empty items in those comma-separated lists (for example after a trailing
    comma) are ignored, and blank rows are skipped.
    '''
    # The context manager closes the file once the generator is exhausted or
    # closed; the bare open() used before was never closed.
    with open(bedfile) as handle:
        for row in csv.reader(handle, dialect='excel-tab'):
            if not row:
                continue
            chrom = row[0]
            chrom_start = int(row[1])
            geneid = row[3]

            # BED12 block lists commonly end with a comma, which would leave
            # an empty string that int() cannot parse.
            exon_sizes = [int(s) for s in row[10].split(',') if s]
            exon_starts = [chrom_start + int(s) for s in row[11].split(',') if s]

            last_index = len(exon_starts) - 1
            exons = []
            for i, exon_start in enumerate(exon_starts):
                exon_end = exon_start + exon_sizes[i]
                terminal = i == 0 or i == last_index
                exons.append(Interval(exon_start, exon_end,
                                      value={'geneid': geneid,
                                             'terminal': terminal,
                                             'chrom': chrom}))

            yield exons
Example #18
0
def load_repeats(repeatsF):
    """Load repeat annotations into one IntervalTree per reference sequence.

    The first line of the file is treated as a header and skipped. Each data
    line supplies the reference id, start and end in fields 5-7, the strand
    in field 9, the repeat name in field 10 and the repeat type in field 11.
    A repeat whose reference id, start and end were already seen is skipped.

    Returns a dict keyed by reference id (with any "chr" removed); each
    stored interval carries [tname, gtype, strand] as its value.
    """
    rtrees = {}
    seen = set()
    # The context manager closes the file even if a line fails to parse.
    with open(repeatsF) as repeatsIN:
        for lineno, line in enumerate(repeatsIN):
            #skip header
            if lineno == 0:
                continue
            fields = line.rstrip().split('\t')
            (refid, st, en) = fields[5:8]
            refid = refid.replace("chr", "")
            strand = fields[9]
            tname = fields[10]
            gtype = fields[11]
            #use 1-based closed interval
            st = int(st) + 1
            #present already in interval tree
            # The key includes the reference id: keyed on coordinates alone, a
            # repeat on another chromosome with equal coordinates was dropped.
            key = (refid, st, en)
            if key in seen:
                continue
            seen.add(key)
            if refid not in rtrees:
                rtrees[refid] = IntervalTree()
            #must adjust for the open intervals (both) of the interval tree
            itv = Interval(st - 1, int(en) + 1, value=[tname, gtype, strand])
            rtrees[refid].insert_interval(itv)
    return rtrees
Example #19
0
def load_repeats(repeatsF):
    """Load repeat annotations into one IntervalTree per reference sequence.

    The first line of the file is treated as a header and skipped. Each data
    line supplies the reference id, start and end in fields 5-7, the repeat
    name in field 10 and the repeat type in field 11. A repeat whose
    reference id, start and end were already seen is skipped.

    Returns a dict keyed by reference id; each stored interval spans
    (int(start), int(end)) and carries [tname, gtype] as its value.
    """
    rtrees = {}
    seen = set()
    # The context manager closes the file even if a line fails to parse.
    with open(repeatsF) as repeatsIN:
        for lineno, line in enumerate(repeatsIN):
            #skip header
            if lineno == 0:
                continue
            fields = line.rstrip().split('\t')
            (refid, st, en) = fields[5:8]
            tname = fields[10]
            gtype = fields[11]

            #present already in interval tree
            # The key includes the reference id: keyed on coordinates alone, a
            # repeat on another chromosome with equal coordinates was dropped.
            key = (refid, st, en)
            if key in seen:
                continue
            seen.add(key)
            if refid not in rtrees:
                rtrees[refid] = IntervalTree()
            itv = Interval(int(st), int(en), value=[tname, gtype])
            rtrees[refid].insert_interval(itv)
    return rtrees
Example #20
0
def find_overlap_dataframes(query, hits):
    '''
    find overlap between sorted query and hits regions
    :param query: [pd.DataFrame] query peaks; the first three columns are
                  read as chrom, start, end
    :param hits: [pd.DataFrame] hits regions with exactly the columns
                 chrom, start, end
    :return: (query_idx, hit_indexs) -- an array holding the positional index
             of a query row once per overlapping hit, and the index values of
             the hits rows whose "chrom:start-end" label overlapped a query

    Query rows on a chromosome that has no hits are skipped.
    '''
    tree_hash = {chrom: Intersecter() for chrom in hits.chrom.unique()}
    for chrom, start, end in hits.values:
        tree_hash[chrom].add_interval(Interval(start, end))

    query_idx, hits_labs = [], []
    for idx, line in enumerate(query.values):
        chrom, start, end = line[0:3]
        # A chromosome that is absent from hits cannot overlap anything;
        # indexing the dict directly would raise KeyError here.
        tree = tree_hash.get(chrom)
        if tree is None:
            continue
        for ovp in tree.find(start, end):
            hits_labs.append(chrom + ':' + str(ovp.start) + '-' + str(ovp.end))
            query_idx.append(idx)

    # Rebuild the same "chrom:start-end" label for each hits row so the
    # overlapping rows can be located by label.
    tmp_labels = hits.chrom + ':' + hits.start.astype(
        str) + '-' + hits.end.astype(str)
    hit_indexs = tmp_labels[tmp_labels.isin(hits_labs)].index.values
    return np.array(query_idx), hit_indexs
Example #21
0
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """ Count, per transcript, the reads compatible with that transcript

    A read is counted for a transcript when every aligned block of the read
    lies inside an exon of that transcript and the read's set of junction
    values is a subset of the transcript's junction values.

    :TODO rename
    :TODO change to cython
    Arguments
    ---------
    transcripts : list
        one entry per transcript; each entry is a sequence of exons whose
        first two items are read as start and end
    bam_iter : pysam.BamFileIterator
        gotten after pysam.BamFile.fetch() call; each read must provide
        get_blocks()
    number_of_counts : not used by this function

    Returns
    -------
    out_counts : one count per transcript, in the order of `transcripts`

    NOTE(review): zeros, nrepeat, logical_and, array and sum_ are defined
    outside this block -- presumably numpy aliases; confirm.
    """
    # Convert this to Cython
    out_counts = zeros(len(transcripts))
    intron_lengths = []
    read_vector = []
    tree = Intersecter()
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        ex_list = []
        for j, i in enumerate(transcript):
            # every exon goes into one shared tree, tagged with its transcript index
            tree.add_interval(
                Interval(int(i[0]), int(i[1]), value={'anno': ti}))
            if j != 0:
                # previous exon end minus this exon start: the same quantity
                # that is computed for read blocks below
                ex_list.append(transcript[j-1][1]\
                        - transcript[j][0])
        intron_lengths.append(ex_list)
    for read in bam_iter:
        block_counter = zeros((len(transcripts), ))
        intron_match = zeros((len(transcripts), ))
        blocks = read.get_blocks()
        junction_lengths = []
        for i, j in enumerate(blocks):
            if i != 0:
                # previous block end minus this block start
                junction_lengths.append(blocks[i - 1][1] - j[0])
            else:
                pass
        junction_lengths = set(junction_lengths)
        for i, k in enumerate(blocks):
            overlap = tree.find(k[0], k[1])
            if len(overlap) == 0:
                # a block outside every exon: stop looking at further blocks
                break
            else:
                for s in overlap:
                    # only exons that fully contain the block are counted
                    if (k[0] >= s.start) and\
                            (k[1] <= s.end):
                        block_counter[s.value['anno']] += 1
        for ij, il in enumerate(intron_lengths):
            # flag transcripts whose junction values include all of the read's
            if set(junction_lengths).issubset(set(il)):
                intron_match[ij] = 1
            else:
                pass
        smatch = nrepeat(len(blocks), len(transcripts))
        # compatible: all blocks were contained and the junctions matched
        gg = logical_and(block_counter == smatch, intron_match)
        read_vector.append(gg)
        out_counts += gg
    # NOTE(review): read_matrix and uniq_r are computed but never used or returned.
    read_matrix = array(read_vector)
    uniq_r = sum_(read_matrix, axis=1) == 1
    #normalization_constant = [for i in transcripts]
    return (out_counts)
Example #22
0
def make_uuid_sets(tables):
    ''' UUID interval tree --> UUID intersection graph --> connected components --> UUID sets

    Every record's span (5' end - 10 to 3' end + 10, reordered if reversed)
    is stored per chromosome. Records whose spans intersect are then linked
    in a graph, and the connected components are returned as a list.
    '''
    def padded_records():
        # Yield (chrom, start, end, uuid) for every record of every table.
        for table in tables:
            for rec in getrec(table):
                chrom = rec['Chromosome']
                start = int(rec['5_Prime_End']) - 10
                end = int(rec['3_Prime_End']) + 10
                if start > end:
                    start, end = end, start
                yield chrom, start, end, rec['UUID']

    forest = dd(Intersecter)
    for chrom, start, end, uuid in padded_records():
        forest[chrom].add_interval(Interval(start, end, value=uuid))

    G = nx.Graph()
    # Second pass: query each span against the completed forest.
    for chrom, start, end, uuid in padded_records():
        for neighbour in forest[chrom].find(start, end):
            G.add_edge(uuid, neighbour.value)

    return list(nx.connected_components(G))
Example #23
0
    def test_upstream(self):
        """upstream_of_interval() respects the strand of the query interval."""
        tree = self.intervals

        # Default strand: upstream hits end before the query.
        hits = tree.upstream_of_interval(
            Interval(59, 60), num_intervals=200)
        for hit in hits:
            self.assertTrue(hit.end < 59)

        # Reverse strand: upstream hits start after the query.
        hits = tree.upstream_of_interval(
            Interval(60, 70, strand=-1), num_intervals=200)
        for hit in hits:
            self.assertTrue(hit.start > 70)

        hits = tree.upstream_of_interval(
            Interval(58, 58, strand=-1), num_intervals=200)
        for hit in hits:
            self.assertTrue(hit.start > 59)
 def after(self, contig, start, end, num_intervals=1, max_dist=2500):
     '''get closest interval(s) after *end*.

     Looks up the index of *contig* and asks it for up to *num_intervals*
     intervals after Interval(start, end), no further than *max_dist* away.

     Returns a list of (start, end, value) tuples.
     Raises KeyError if *contig* is not in the index.
     '''
     if contig not in self.mIndex:
         raise KeyError("contig %s not in index" % contig)
     # Forward the caller's num_intervals; it used to be hard-coded to 1, so
     # the parameter had no effect.
     found = self.mIndex[contig].after_interval(
         Interval(start, end), num_intervals=num_intervals, max_dist=max_dist)
     return [(x.start, x.end, x.value) for x in found]
Example #25
0
def regionTree(tmp, resFrag):
    """
    Add one region to the per-chromosome interval trees in resFrag.

    tmp: list of at least 3 items (chrom, start, end); any further items are
         stored as the interval's value
    resFrag: dictionary mapping chromosome to its Intersecter; a tree is
             created for a chromosome seen for the first time
    """
    chrom = tmp[0]
    if chrom not in resFrag:
        resFrag[chrom] = Intersecter()
    extra_fields = tmp[3:]
    region = Interval(int(tmp[1]), int(tmp[2]), extra_fields)
    resFrag[chrom].add_interval(region)
Example #26
0
    def __init__(self,coordinates):
        """Index coordinates in one Intersecter per chromosome id.

        Each coordinate must provide chr_id, bpstart and bpend; the coordinate
        object itself is stored as the interval's value.
        """
        trees = dict()
        self.interval_tree = trees

        for coord in coordinates:
            chrom = coord.chr_id
            if chrom not in trees:
                trees[chrom] = Intersecter()

            trees[chrom].add_interval(Interval(coord.bpstart, coord.bpend, coord))
Example #27
0
def parse_gene_coordinate(infile):
    """Yield (chromosome, Interval) for each line of a comma-separated file.

    The first column is the gene id and the columns from the third onwards
    must be exactly chromosome, start, end. The Interval spans
    (int(start), int(end)) and carries {'geneid': geneid} as its value.
    Blank lines are skipped.
    """
    # The context manager closes the file once the generator is exhausted or
    # closed; the bare open() used before was never closed.
    with open(infile) as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped:
                continue
            cols = stripped.split(',')
            geneid = cols[0]
            # named `chrom` so the builtin chr() is not shadowed
            chrom, start, end = cols[2:]
            yield chrom, Interval(int(start), int(end), value={'geneid': geneid})
Example #28
0
def init_intersecter(hits):
    '''Return an Intersecter holding one interval (hit[0], hit[1]) per hit.'''
    tree = Intersecter()
    for hit in hits:
        begin, finish = hit[0], hit[1]
        tree.add_interval(Interval(begin, finish))
    return tree
Example #29
0
    def setUp(self):
        """Fill an IntervalTree through each of its four insertion entry points."""
        tree = IntervalTree()
        count = 0
        for base in range(1, 1000, 80):
            squared = base * base
            tree.insert(base, base + 10, {'value': squared})
            # add is synonym for insert.
            tree.add(base + 20, base + 30, {'astr': str(squared)})

            # or insert/add an interval object with start, end attrs.
            tree.insert_interval(
                Interval(base + 40, base + 50, value={'astr': str(squared)}))
            tree.add_interval(
                Interval(base + 60, base + 70, value={'astr': str(squared)}))

            count += 4
        self.intervals = self.iv = tree
        self.nintervals = count
Example #30
0
    def test_right(self):
        """right() returns the nearest intervals starting after a position."""
        iv = self.intervals
        self.assertEqual(str(iv.left(60, n=2)),
                         str([Interval(50, 59),
                              Interval(40, 49)]))

        def get_right_start(b10):
            # start of the single closest interval to the right of b10 + 1
            r = iv.right(b10 + 1, n=1)
            assert len(r) == 1
            return r[0].start

        for i in range(10, 100, 10):
            self.assertEqual(get_right_start(i), i + 10)

        for i in range(0, 100, 10):
            r = iv.right(i - 1, max_dist=10, n=1)
            # print(r) with one argument works on Python 2 and 3 alike; the
            # former `print r` statement is a syntax error on Python 3.
            print(r)
            self.assertEqual(r[0].start, i)
Example #31
0
    def test_n(self):
        """after() and after_interval() return the next two intervals in order."""
        tree = self.intervals
        for low in range(0, 90, 10):
            expected_starts = (low + 10, low + 20)

            by_position = tree.after(low, max_dist=20, num_intervals=2)
            self.assertEqual(by_position[0].start, expected_starts[0])
            self.assertEqual(by_position[1].start, expected_starts[1])

            by_interval = tree.after_interval(
                Interval(low, low), max_dist=20, num_intervals=2)
            self.assertEqual(by_interval[0].start, expected_starts[0])
            self.assertEqual(by_interval[1].start, expected_starts[1])