def count_reads(transcripts, bam_iter, number_of_counts=1):
    """
    Count the reads compatible with each transcript.

    A read is counted for a transcript when (a) every aligned block of the
    read is contained in an exon of that transcript and (b) the set of gaps
    between consecutive blocks of the read is a subset of the gaps between
    consecutive exons of the transcript.

    :TODO rename
    :TODO change to cython

    Arguments
    ---------
    transcripts : list
        list of transcripts, each a position-sorted list of exons where
        exon[0] is the start and exon[1] is the end
    bam_iter : pysam.BamFileIterator
        obtained from a pysam.BamFile.fetch() call; each read must provide
        get_blocks()
    number_of_counts : int
        unused, kept so existing callers keep working

    Returns
    -------
    out_counts : array with one count per transcript
    """
    n_transcripts = len(transcripts)
    out_counts = zeros(n_transcripts)
    intron_lengths = []
    tree = Intersecter()
    # Assume exons are position sorted
    for ti, transcript in enumerate(transcripts):
        ex_list = []
        for j, exon in enumerate(transcript):
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
            if j != 0:
                # gap between the previous exon end and this exon start
                # (negative by construction, same sign as the read gaps)
                ex_list.append(transcript[j - 1][1] - transcript[j][0])
        # stored as a set once, instead of rebuilding it for every read
        intron_lengths.append(set(ex_list))

    for read in bam_iter:
        block_counter = zeros((n_transcripts,))
        intron_match = zeros((n_transcripts,))
        blocks = read.get_blocks()
        junction_lengths = set(
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks)))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # a block outside every exon: the read cannot match fully
                break
            for hit in overlap:
                if (block[0] >= hit.start) and (block[1] <= hit.end):
                    block_counter[hit.value['anno']] += 1
        for ij, introns in enumerate(intron_lengths):
            if junction_lengths.issubset(introns):
                intron_match[ij] = 1
        smatch = nrepeat(len(blocks), n_transcripts)
        # The per-read vectors used to be collected and summed along axis 1
        # without the result ever being used; that kept every vector in
        # memory and failed when bam_iter yielded no reads.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return out_counts
def buildIntervalTree(exons):
    '''Return an Intersecter holding one interval per exon annotation.

    Every interval spans exon.start to exon.end and stores the exon's
    cStart and cEnd attributes in its value dict.
    '''
    exon_tree = Intersecter()
    for annotation in exons:
        begin, finish = annotation.start, annotation.end
        payload = {'cStart': annotation.cStart, 'cEnd': annotation.cEnd}
        exon_tree.add_interval(Interval(begin, finish, value=payload))
    return exon_tree
def count_reads(transcripts, bam_iter, number_of_counts=1):
    """
    Count the reads in a given transcript

    A read contributes to a transcript only if each of its aligned blocks
    lies inside one of that transcript's exons and all gaps between its
    consecutive blocks also occur between consecutive exons of the
    transcript.

    :TODO rename
    :TODO change to cython

    Arguments
    ---------
    transcripts : list
        list of transcripts; each transcript is a position-sorted list of
        exons indexed as (start, end)
    bam_iter : pysam.BamFileIterator
        obtained from a pysam.BamFile.fetch() call
    number_of_counts : int
        not used by the implementation; retained for compatibility

    Returns
    -------
    out_counts : array of length len(transcripts)
    """
    # Convert this to Cython
    transcript_total = len(transcripts)
    out_counts = zeros(transcript_total)
    tree = Intersecter()
    # Assume exons are position sorted
    intron_lengths = []
    for ti, transcript in enumerate(transcripts):
        for exon in transcript:
            tree.add_interval(
                Interval(int(exon[0]), int(exon[1]), value={'anno': ti}))
        # gaps between neighbouring exons, kept as a set for subset tests
        intron_lengths.append(set(
            transcript[j - 1][1] - transcript[j][0]
            for j in range(1, len(transcript))))

    for read in bam_iter:
        blocks = read.get_blocks()
        junction_lengths = set(
            blocks[i - 1][1] - blocks[i][0] for i in range(1, len(blocks)))

        block_counter = zeros((transcript_total,))
        for block in blocks:
            overlap = tree.find(block[0], block[1])
            if len(overlap) == 0:
                # stop early: this block is outside every annotated exon
                break
            for exon_hit in overlap:
                if (block[0] >= exon_hit.start) and (block[1] <= exon_hit.end):
                    block_counter[exon_hit.value['anno']] += 1

        intron_match = zeros((transcript_total,))
        for ij, introns in enumerate(intron_lengths):
            if junction_lengths.issubset(introns):
                intron_match[ij] = 1

        smatch = nrepeat(len(blocks), transcript_total)
        # Only the running total is kept. The former per-read matrix was
        # never used and its axis-1 sum broke on an empty bam_iter.
        out_counts += logical_and(block_counter == smatch, intron_match)
    return out_counts
def init_intersecter(hits):
    '''
    Build an Intersecter from an iterable of records whose first two
    items are the interval start and end.
    '''
    tree = Intersecter()
    for record in hits:
        begin = record[0]
        finish = record[1]
        tree.add_interval(Interval(begin, finish))
    return tree
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Every fragment is stored; fragments whose
    length is below minfragsize or above maxfragsize are flagged with
    'filter': True in the interval value instead of being dropped here.

    in_file = input file [character]
    minfragsize = minimum fragment length, None disables the check
    maxfragsize = maximum fragment length, None disables the check
    verbose = verbose mode [logical]
    """
    resFrag = {}
    if verbose:
        # single-argument form: same text under Python 2 and Python 3
        print("## Loading Restriction File Intervals ' %s '..." % in_file)

    nline = 0
    nfilt = 0
    # 'with' closes the handle even when a malformed coordinate raises
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Flag fragments outside the size range
            filt = False
            if minfragsize is not None and fragl < int(minfragsize):
                nfilt += 1
                filt = True
            elif maxfragsize is not None and fragl > int(maxfragsize):
                nfilt += 1
                filt = True

            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name, 'filter': filt}))

    if nfilt > 0:
        print("Warning :  %s fragment(s) outside of range and discarded.  %s  remaining."
              % (nfilt, nline - nfilt))
    return resFrag
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    This function is adapted from the function of the same name in Hi-C Pro
    (https://github.com/nservant/HiC-Pro/blob/master/scripts/mapped_2hic_fragments.py)
    by Nicolas Servant, Eric Viara.

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Each interval value holds the fragment
    'name' and its 'midPoint' ((start + end) / 2). Fragments outside the
    [minfragsize, maxfragsize] range are reported and skipped.

    in_file = input file [character]
    minfragsize = minimum fragment length, None disables the check
    maxfragsize = maximum fragment length, None disables the check
    verbose = verbose mode [logical]
    """
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals '" + in_file + "'...")

    nline = 0
    # 'with' guarantees the handle is closed if a coordinate fails to parse
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # nline is an int: it must be converted before concatenation,
                # otherwise the warning itself raises TypeError
                print("Warning : wrong input format in line " + str(nline) + ". Not a BED file !?")
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            midPoint = (start + end)/2
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_short = minfragsize is not None and fragl < int(minfragsize)
            too_long = maxfragsize is not None and fragl > int(maxfragsize)
            if too_short or too_long:
                # fragl is an int as well, hence str()
                print("Warning : fragment " + name + " [" + str(fragl) + "] outside of range. Discarded")
                continue

            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(
                Interval(start, end, value={'name': name, 'midPoint': midPoint}))
    return resFrag
def add_pvalues_to_peaks_frame_macs_bf(peaks_frame,experiment_peaks_frame,TTAA_frame,lam_win_size,pseudocounts = 0.2,macs_pvalue=True): print "lab specific hohoho" experiment_gnashy_dict = {} experiment_dict_of_trees = {} TTAA_frame_gbChr_dict = {} TTAA_dict_of_trees = {} list_of_l_names = [lam_win_size] print "Making interval tree for experiment hops..." for name,group in experiment_peaks_frame.groupby('Chr'): experiment_gnashy_dict[name] = group experiment_gnashy_dict[name].index = experiment_gnashy_dict[name]["Start"] #initialize tree experiment_dict_of_trees[name] = Intersecter() #populate tree with position as interval for idx, row in experiment_gnashy_dict[name].iterrows(): experiment_dict_of_trees[name].add_interval(Interval(int(idx),int(idx)+3)) print "Making interval tree for TTAAs..." #make interval tree for TTAAs for name,group in TTAA_frame.groupby('Chr'): TTAA_frame_gbChr_dict[name] = group TTAA_frame_gbChr_dict[name].index = TTAA_frame_gbChr_dict[name]["Start"] #initialize tree TTAA_dict_of_trees[name] = Intersecter() #populate tree with position as interval for idx, row in TTAA_frame_gbChr_dict[name].iterrows(): TTAA_dict_of_trees[name].add_interval(Interval(int(idx),int(idx+3))) #go through cluster frame and compute pvalues lambda_type_list =[] lambda_list = [] pvalue_list = [] for idx,row in peaks_frame.iterrows(): #add number of background hops in cluster to frame cluster_center = row["Center"] #find lambda and compute significance of cluster num_TTAAs = len(TTAA_dict_of_trees[row["Chr"]].find(row["Start"],row["End"])) #compute lambda for window size num_exp_hops_lam_win_size = len(experiment_dict_of_trees[row["Chr"]].find(cluster_center-(lam_win_size/2 - 1),cluster_center+lam_win_size/2)) num_TTAAs_lam_win_size = len(TTAA_dict_of_trees[row["Chr"]].find(cluster_center-(lam_win_size/2 - 1),cluster_center+lam_win_size/2)) lambda_win_size = float(num_exp_hops_lam_win_size)/(max(num_TTAAs_lam_win_size,1)) lambda_f = lambda_win_size 
lambda_type_list.append(lam_win_size) lambda_list.append(lambda_f) #compute pvalue and record it pvalue = 1-scistat.poisson.cdf((row["Experiment Hops"]+pseudocounts),lambda_f*max(num_TTAAs,1)+pseudocounts) pvalue_list.append(pvalue) #make frame from all of the lists peaks_frame["Lambda Type"] = lambda_type_list peaks_frame["Lambda"] = lambda_list peaks_frame["Poisson pvalue"] = pvalue_list return peaks_frame
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Fragments shorter than minfragsize or
    longer than maxfragsize are reported and not stored.

    in_file = input file [character]
    minfragsize = minimum fragment length, None disables the check
    maxfragsize = maximum fragment length, None disables the check
    verbose = verbose mode [logical]
    """
    resFrag = {}
    if verbose:
        # single-argument form: same text under Python 2 and Python 3
        print("## Loading Restriction File Intervals ' %s '..." % in_file)

    nline = 0
    # 'with' closes the handle even when int() raises on a bad coordinate
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            fragl = abs(end - start)
            name = name.strip()

            ## Discard fragments outside the size range
            too_short = minfragsize is not None and fragl < int(minfragsize)
            too_long = maxfragsize is not None and fragl > int(maxfragsize)
            if too_short or too_long:
                print("Warning : fragment  %s  [ %s ] outside of range. Discarded" % (name, fragl))
                continue

            # direct dict membership instead of scanning resFrag.keys()
            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(Interval(start, end, value={'name': name}))
    return resFrag
def find_overlap_dataframes(query, hits):
    '''
    find overlap between sorted query and hits regions
    :param query: [pd.DataFrame] query peaks; the first three columns are
        read as chrom, start, end
    :param hits: [pd.DataFrame] hits regions with exactly the columns
        chrom, start, end (rows are unpacked into three values)
    :return: (query_idx, hit_indexs) - query_idx [array] holds the
        positional index of a query row once per overlapping hit interval;
        hit_indexs [array] holds the index labels of the hits rows whose
        "chrom:start-end" label was overlapped by any query. The two arrays
        are not aligned element by element.
    '''
    tree_hash = {chrom: Intersecter() for chrom in hits.chrom.unique()}
    for chrom, start, end in hits.values:
        tree_hash[chrom].add_interval(Interval(start, end))

    query_idx = []
    hits_labs = set()
    for idx, line in enumerate(query.values):
        chrom, start, end = line[0:3]
        tree = tree_hash.get(chrom)
        if tree is None:
            # chromosome has no hits region at all: nothing can overlap
            # (indexing tree_hash directly raised KeyError here)
            continue
        for ovp in tree.find(start, end):
            hits_labs.add(chrom + ':' + str(ovp.start) + '-' + str(ovp.end))
            query_idx.append(idx)

    tmp_labels = hits.chrom + ':' + hits.start.astype(
        str) + '-' + hits.end.astype(str)
    hit_indexs = tmp_labels[tmp_labels.isin(hits_labs)].index.values
    return np.array(query_idx), hit_indexs
def load_bed(read_length, path, flanking):
    """
    parse bed file and ignore overlapping regions
    :param read_length: targets shorter than twice this length (after
        flanking) are skipped
    :param path: BED file path
    :param flanking: Integer to add to each target's start and end
    :return: list of [chrom, start, end] for every accepted target, in file
        order; a target is accepted only if it does not overlap a target
        accepted earlier on the same chromosome
    """
    targets = {}
    unique_targets = []
    for record in BedTool(path):
        chrom, start, end = record[0], int(record[1]), int(record[2])
        if end < start:
            # Target written end-first (minus strand): order the coordinates
            # before extending, otherwise the flanking shrinks the target
            # instead of growing it.
            start, end = end, start
        start -= flanking
        end += flanking
        if end < start:
            # a negative flanking larger than the target can still cross over
            start, end = end, start
        if chrom not in targets:
            targets[chrom] = Intersecter()
        long_enough = abs(start - end) >= (read_length * 2)
        if not targets[chrom].find(start, end) and long_enough:
            # no overlaps yet
            targets[chrom].add_interval(Interval(start, end))
            unique_targets.append([chrom, start, end])
    return unique_targets
def regionTree2(tmp,resFrag,strand,info,geneid):
    """
    Add the region tmp (chrom, start, end, ...) to resFrag[strand][chrom].

    resFrag is a two-level dictionary (strand, then chromosome) of
    Intersecter trees; missing levels are created on demand. The interval
    value stores info under "exon" and geneid under 'geneid'.
    """
    chrom = tmp[0]
    per_chrom = resFrag.setdefault(strand, {})
    if chrom not in per_chrom:
        per_chrom[chrom] = Intersecter()
    payload_start = int(tmp[1])
    payload_end = int(tmp[2])
    per_chrom[chrom].add_interval(
        Interval(payload_start, payload_end,
                 value={"exon": info, 'geneid': geneid}))
def regionTree(tmp, resFrag):
    """
    Insert one region into the per-chromosome interval trees.

    tmp: list of at least 3 items (chrom, start, end); anything after the
         third item is stored as the interval value
    resFrag: dictionary mapping chromosome to its Intersecter; the tree is
             created when the chromosome is seen for the first time
    """
    chrom = tmp[0]
    tree = resFrag.get(chrom)
    if tree is None:
        tree = Intersecter()
        resFrag[chrom] = tree
    extra_fields = tmp[3:]
    tree.add_interval(Interval(int(tmp[1]), int(tmp[2]), extra_fields))
def __init__(self,coordinates):
    """Index coordinates by chromosome.

    self.interval_tree maps each chr_id to an Intersecter; every coordinate
    is stored as an interval (bpstart, bpend) carrying the coordinate
    object itself as its value.
    """
    trees = {}
    self.interval_tree = trees
    for coord in coordinates:
        chrom = coord.chr_id
        if chrom not in trees:
            trees[chrom] = Intersecter()
        trees[chrom].add_interval(Interval(coord.bpstart, coord.bpend, coord))
def maps_gene(mapped):
    '''Return the gene intervals overlapping the mapped alignment.

    The interval tree of a genome is built from the 'gene' features in the
    database the first time that genome is seen, then cached in the
    module-level intersecters dict.
    '''
    global intersecters
    try:
        intersecter = intersecters[mapped['genome']]
    except KeyError:
        gene_query = {'genome': mapped['genome'], 'type': 'gene'}
        genes = db.feature.find(gene_query)
        intersecter = Intersecter()
        for gene in genes:
            # Interval end is exclusive, need to +1 to line up with actual position
            gene_span = Interval(gene['start'], gene['end'] + 1, gene['uid'])
            intersecter.add_interval(gene_span)
        intersecters[mapped['genome']] = intersecter
    return intersecter.find(mapped['refStart'], mapped['refEnd'])
def load_BED(in_file, verbose=False): """ Read a BED file and store the intervals in a tree Intervals are zero-based objects. The output object is a hash table with one search tree per chromosome BED file are half-open, meaning that a bin ]100, 200] covered the bases 101 to 200 in_file = input file [character] verbose = verbose mode [logical] """ x = {} if verbose: print "## Loading BED file '", in_file, "'..." featureNames = [] nline = 0 with open(in_file) as bed_handle: for line in bed_handle: if nline > 0 and nline % 5000 == 0 and verbose: print "## %d features loaded ..." % nline nline += 1 bedtab = line.split("\t") try: chromosome, start, end, name = bedtab[:4] except ValueError: print >> sys.stderr, "Warning : wrong input format in line", nline, ". Not a BED file !?" continue # BED files are zero-based, half-open as Intervals objects start = int(start) end = int(end) featureNames.append(name.strip()) if chromosome in x: tree = x[chromosome] tree.add_interval( Interval(start, end, value={'pos': nline - 1})) else: tree = Intersecter() tree.add_interval( Interval(start, end, value={'pos': nline - 1})) x[chromosome] = tree bed_handle.close() return (x, featureNames)
def load_bed(in_file, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Only the first three columns
    (chromosome, start, end) are used; lines with fewer columns are reported
    on stderr and skipped.

    in_file = input file [character]
    verbose = verbose mode [logical]
    """
    intervals = {}
    if verbose:
        # same text as before, written in a form valid on Python 2 and 3
        sys.stderr.write("## Loading BED file ' %s '...\n" % in_file)

    # 'with' closes the handle even when a coordinate fails to parse
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.strip().split("\t")
            try:
                chromosome, start, end = bedtab[:3]
            except ValueError:
                sys.stderr.write(
                    "Warning : wrong input format in line %s . Not a BED file !?\n" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            if chromosome not in intervals:
                intervals[chromosome] = Intersecter()
            intervals[chromosome].add_interval(Interval(start, end))
    return intervals
def maps_gene(mapped):
    """Look up the genes that the mapped alignment overlaps.

    Gene intervals of a genome are loaded from the database on first use
    and kept in the module-level intersecters cache afterwards. The result
    is whatever the tree's find() returns for refStart..refEnd.
    """
    global intersecters
    try:
        gene_tree = intersecters[mapped['genome']]
    except KeyError:
        gene_tree = Intersecter()
        genes = db.feature.find({'genome': mapped['genome'], 'type': 'gene'})
        for gene in genes:
            # Interval end is exclusive, need to +1 to line up with actual position
            gene_tree.add_interval(
                Interval(gene['start'], gene['end'] + 1, gene['uid']))
        intersecters[mapped['genome']] = gene_tree
    ref_start = mapped['refStart']
    ref_end = mapped['refEnd']
    return gene_tree.find(ref_start, ref_end)
def load_BED(in_file, exclusionSize=0, verbose=False):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a pair of hash
    tables with one search tree per chromosome: the first holds the BED
    intervals (value {'name': name}), the second holds, when
    exclusionSize > 0, the flanking regions of exclusionSize bases before
    the start (name + "_up") and after the end (name + "_dwn") of each
    interval.

    in_file = input file [character]
    exclusionSize = size of the flanking exclusion regions, 0 disables them
    verbose = verbose mode [logical]
    """
    x = {}
    x_ex = {}
    if verbose:
        # single-argument form: same text under Python 2 and Python 3
        print("## Loading BED file ' %s '..." % in_file)
    nline = 0
    with open(in_file) as bed_handle:
        for line in bed_handle:
            nline += 1
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                print("Warning : wrong input format in line %s . Not a BED file !?" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()
            # direct dict membership instead of scanning x.keys()
            if chromosome not in x:
                x[chromosome] = Intersecter()
            x[chromosome].add_interval(Interval(start, end, value={'name': name}))

            ## Exclusion regions
            if exclusionSize > 0:
                flank = int(exclusionSize)
                if chromosome not in x_ex:
                    x_ex[chromosome] = Intersecter()
                tree_ex = x_ex[chromosome]
                tree_ex.add_interval(
                    Interval(start - flank, start, value={'name': str(name) + "_up"}))
                tree_ex.add_interval(
                    Interval(end, end + flank, value={'name': str(name) + "_dwn"}))
    # the 'with' block has already closed the file
    return (x, x_ex)
def get_reads_and_ranges(bam_region, cid, chrom, region_start, region_end, strand, options):
    """
    Collect the fragments of a region and update the per-position ranges.

    Reads come from filter_reads() on options.chrom_prefix + chrom. A valid
    paired read spans min(pos, pnext) + 1 for abs(isize) bases; a valid
    single read spans pos + 1 for aln_length(cigar) bases (1-based, end
    included). Each fragment is added to the returned Intersecter and passed
    to inc_pr(); inc_pr_at() is then called for both fragment ends (paired,
    merged or trimmed reads), for the end only (reverse single reads) or for
    the start only (remaining single reads).
    cid and strand are accepted but not read here.

    Returns (filtered_reads, pos_range); pos_range is a defaultdict whose
    missing entries start as [0, 0].
    """
    pos_range = defaultdict(lambda: [0,0])
    filtered_reads = Intersecter()
    contig = options.chrom_prefix + chrom
    for read in filter_reads(bam_region, contig, region_start, region_end, options):
        if is_valid_paired(read, region_start, options):
            paired = True
            rstart = min(read.pos, read.pnext) + 1
            rend = rstart + abs(read.isize) - 1
        elif is_valid_single(read, options):
            paired = False
            rstart = read.pos + 1
            rend = rstart + aln_length(read.cigar) - 1
        else:
            continue

        filtered_reads.add_interval(Interval(rstart, rend))
        inc_pr(pos_range, rstart, rend, region_start, region_end)

        # decide which fragment ends get recorded
        if paired or as_merged(read, options) or as_trimmed(read, options):
            endpoints = (rstart, rend)
        elif read.is_reverse:
            endpoints = (rend,)
        else:
            endpoints = (rstart,)
        for endpoint in endpoints:
            inc_pr_at(pos_range, endpoint, region_start, region_end)
    return filtered_reads, pos_range
def _index_record_in_intervaldict(self, rec):
    '''
    Add every feature of the record rec to the interval tree cached for
    rec.id in self.intervaldict (the tree is created on first use).

    Each interval spans the feature location, stores the feature under
    value['cdsobj'] and carries the record id as chrom; the feature strand
    is passed along whenever the feature has a strand attribute.
    '''
    chrom = rec.id
    tree = self.intervaldict.setdefault(chrom, Intersecter())
    for cds in rec.features:  # Each top-level CDS
        extras = {'chrom': chrom, 'value': {'cdsobj': cds}}
        if hasattr(cds, 'strand'):
            extras['strand'] = cds.strand  # could still be None.
        begin = int(cds.location.start)
        finish = int(cds.location.end)
        tree.add_interval(Interval(begin, finish, **extras))
def purify_introns(Introns_Dict, Exons_Sect):
    '''
    Keep only the introns that no exon interval intersects; these are
    likely to be "pure" introns that lack any real polyA sites.

    Both arguments are keyed by (chrm, strand). The result maps each key
    that has at least one pure intron to an Intersecter holding those
    introns, with the intron record itself as the interval value.
    '''
    pure = dict()
    for (chrm, strand) in Introns_Dict.keys():
        key = (chrm, strand)
        for intron in Introns_Dict[key]:
            begin, finish = intron[0], intron[1]
            if Exons_Sect[key].find(begin, finish):
                # an exon lies inside or across this intron: not pure
                continue
            if key not in pure:
                pure[key] = Intersecter()
            pure[key].add(begin, finish, intron)
    return pure
def calculate_score(chrom, start, end,ctype="WPS"):
    """
    Return one score per position of start..end (both included).

    Reads are taken from readIterator(args, chrom, start, end); duplicates,
    QC failures, unmapped and soft-clipped reads are skipped. A pair is
    turned into one fragment (from read1, or from read2 when its mate ends
    before start); an unpaired read is one fragment of its aligned length.
    Fragments are kept when their length is within
    options.minLength..options.maxLength, after optional random
    downsampling with options.downsample.

    ctype selects the score:
      "COV"    - number of kept fragments covering the position
      "STARTS" - number of kept fragment ends at the position
      "WPS"    - for a window of options.protection//2 on each side of the
                 position, fragments spanning the whole window minus
                 fragments overlapping it without spanning it
    """
    filteredReads = Intersecter()
    posRange = defaultdict(int)

    def skip_by_downsampling():
        # draws a random number only when downsampling is switched on
        return options.downsample != None and random.random() >= options.downsample

    def record_fragment(frag_start, frag_end):
        # keep the fragment if its length is in range and update the counts
        frag_length = frag_end-frag_start+1
        if not (options.minLength <= frag_length <= options.maxLength):
            return
        filteredReads.add_interval(Interval(frag_start,frag_end))
        if ctype == "COV":
            for i in range(frag_start,frag_end+1):
                if start <= i <= end:
                    posRange[i]+=1
        elif ctype == "STARTS":
            for edge in (frag_start, frag_end):
                if start <= edge <= end:
                    posRange[edge]+=1

    for read in readIterator(args,chrom,start,end):
        if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue
        if isSoftClipped(read.cigar): continue

        if not read.is_paired:
            if skip_by_downsampling(): continue
            single_start = read.pos+1 # 1-based
            single_end = single_start+aln_length(read.cigar)-1 # end included
            record_fragment(single_start, single_end)
            continue

        if read.mate_is_unmapped: continue
        if read.rnext != read.tid: continue
        if not (read.is_read1 or (read.is_read2 and read.pnext+read.qlen < start)):
            continue
        if read.isize == 0: continue
        if skip_by_downsampling(): continue
        pair_start = min(read.pos,read.pnext)+1 # 1-based
        pair_end = pair_start+abs(read.isize)-1 # end included
        record_fragment(pair_start, pair_end)

    if ctype == "WPS":
        protection = options.protection//2
        for pos in xrange(start,end+1):
            win_start,win_end = pos-protection,pos+protection
            spanning,partial = 0,0
            for fragment in filteredReads.find(win_start,win_end):
                if (fragment.start > win_start) or (fragment.end < win_end):
                    partial +=1
                else:
                    spanning +=1
            posRange[pos]+=spanning-partial

    return [posRange[pos] for pos in xrange(start,end+1)]
def GenomicInterval_2_intersecter_and_dict(GI):
    """
    Index the intervals of GI by (cleaned chromosome name, strand).

    Returns (II, ID): II maps each key to an Intersecter holding the
    intervals with [start, stop] as value, ID maps each key to the list of
    [start, stop] pairs in input order.
    """
    II = dict()
    ID = dict()
    for intron in GI:
        key = (clean_chr_name(intron.chr), intron.strand)
        if key not in II:
            II[key] = Intersecter()
        II[key].add(intron.start, intron.stop, [intron.start, intron.stop])
        if key not in ID:
            ID[key] = []
        # a separate list object from the one stored in the tree
        ID[key].append([intron.start, intron.stop])
    return II, ID
def polyA_dict_2_intersecter( polyA ):
    '''
    Turn a polyA dictionary keyed by (chrm, strand) into a dictionary of
    intersecter objects with the same keys. Each site becomes a zero-width
    interval (p_site, p_site) whose value is [p_site, polyA[key][p_site]].

    NOTE: bx.python is (open,open) in its interval searchers, e.g.
    T.find( 1,10 ) will not return true if T contains (1,1) or (10,10)
    '''
    polyA_I = dict()
    for (chrm, strand) in polyA.keys():
        key = (chrm, strand)
        if key not in polyA_I:
            polyA_I[key] = Intersecter()
        sites = polyA[key]
        for p_site in sites:
            polyA_I[key].add( p_site, p_site, [p_site, polyA[key][p_site]] )
    return polyA_I
def load_restriction_fragment(in_file, verbose):
    """
    Read a BED file and store the intervals in a tree

    Intervals are zero-based objects. The output object is a hash table with
    one search tree per chromosome. Lines with fewer than four tab-separated
    fields are reported on stderr and skipped.

    in_file = input file [character]
    verbose = verbose mode [logical]
    """
    import sys  # local import: only needed for the warning below

    resFrag = {}
    if verbose:
        # single-argument form: same text under Python 2 and Python 3
        print("## Loading Restriction File Intervals ' %s '..." % in_file)

    # 'with' closes the handle even when a coordinate fails to parse
    with open(in_file) as bed_handle:
        for nline, line in enumerate(bed_handle, 1):
            bedtab = line.split("\t")
            try:
                chromosome, start, end, name = bedtab[:4]
            except ValueError:
                # previously skipped without any trace; still best-effort,
                # but the skipped line is now reported
                sys.stderr.write(
                    "Warning : wrong input format in line %s . Not a BED file !?\n" % nline)
                continue

            # BED files are zero-based as Intervals objects
            start = int(start)  # + 1
            end = int(end)
            name = name.strip()
            # direct dict membership instead of scanning resFrag.keys()
            if chromosome not in resFrag:
                resFrag[chromosome] = Intersecter()
            resFrag[chromosome].add_interval(Interval(start, end, value={'name': name}))
    return resFrag
def index_bed(genes, un_stranded):
    """
    Build one Intersecter per contig from a list of annotation dicts.

    With un_stranded the contig is the chromosome name, otherwise it is
    "chrom:strand" and both '+' and '-' trees are created for every
    chromosome. Each annotation is inserted as an interval beg..end with
    chrom, strand and a value dict holding annotation_type and
    annotation_name.
    """
    chrom_names = set([gene['chrom'] for gene in genes])
    bed_quicksect = {}
    if un_stranded:
        for name in set(chrom_names):
            bed_quicksect[name] = Intersecter()
    else:
        for sign in ['+', '-']:
            for name in set(chrom_names):
                bed_quicksect[name + ":" + sign] = Intersecter()

    for annotation in genes:
        chrom = annotation['chrom']
        strand = annotation['strand']
        if un_stranded:
            contig = chrom
        else:
            contig = chrom + ":" + strand
        details = {
            "annotation_type": annotation['annotation_type'],
            "annotation_name": annotation['annotation_name'],
        }
        bed_quicksect[contig].insert_interval(
            Interval(annotation['beg'], annotation['end'],
                     chrom=chrom, strand=strand, value=details))
    return bed_quicksect
def gtf_CDSs_2_intersecter_and_dict(gtf_fn):
    """
    Load the CDS features of a GTF file.

    gtf_fn = path of the GTF file

    Returns (CDS_I, CDS_D), both keyed by (cleaned chromosome name, strand):
    CDS_I maps to an Intersecter holding each CDS with [start, end] as value,
    CDS_D maps to the list of [start, end] pairs in file order. Blank lines
    and '#' comment lines are skipped.
    """
    CDS_I = dict()
    CDS_D = dict()
    # The parameter is gtf_fn; the file used to be opened through the
    # undefined name gtf_fname and was never closed.
    with open(gtf_fn) as fid:
        for line in fid:
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                # header/comment and empty lines have no feature column
                continue
            data = stripped.split('\t')
            if data[2] != 'CDS':
                continue
            chrm = clean_chr_name(data[0])
            strand = data[6]
            start = int(data[3])
            end = int(data[4])
            key = (chrm, strand)
            if key not in CDS_I:
                CDS_I[key] = Intersecter()
            CDS_I[key].add(start, end, [start, end])
            if key not in CDS_D:
                CDS_D[key] = []
            CDS_D[key].append([start, end])
    return CDS_I, CDS_D
def emf_init_index(filename):
    """
    Load an index file with the columns Chr, Start, End, File, Byte, Line.

    Returns None when filename is not an existing file. Otherwise returns a
    dict keyed by chromosome; each entry is a dict holding an Intersecter
    under "coords" plus one (start, end) -> (file, byte, line) item per
    index line. Lines starting with '#' are ignored, lines without exactly
    six fields are reported on stderr.
    """
    from bx.intervals.intersection import Intersecter, Interval
    if not os.path.isfile(filename):
        return None

    res = {}
    # 'with' closes the file even when a field fails to convert to int
    with open(filename) as infile:
        for line in infile:
            if line.startswith("#"):
                continue
            fields = line.split() #Chr\tStart\tEnd\tFile\tByte\tLine
            if len(fields) != 6:
                sys.stderr.write("Unexpected line [%s]: %s"%(filename,line))
                continue
            # byte_offset/line_count avoid shadowing the builtin 'bytes'
            chrom,start,end,cfile,byte_offset,line_count = fields
            start,end = int(start)+1,int(end)+1 #MAKE COORDINATES ONE BASED HALF-OPEN
            if chrom not in res:
                res[chrom] = {}
                res[chrom]["coords"] = Intersecter()
            res[chrom]["coords"].add_interval(Interval(start, end))
            res[chrom][(start,end)] = cfile,int(byte_offset),int(line_count)
    return res
def calculate_intersection(cls,coords1,coords2,build_matrix=False):
    """
    Find the coordinates of coords1 that are hit by coordinates of coords2.

    Returns (coords_in_common, intersection_indexes), both as lists: the
    coordinates rebuilt from the hit intervals and their row indexes in
    coords1. With build_matrix a third item is returned, a sparse matrix of
    shape (len(coords1), len(coords2)) counting the hits per
    (coords1 row, coords2 column) pair.
    """
    if build_matrix:
        intersection_matrix=sparse_matrix((len(coords1),len(coords2)))

    # Build the interval tree on the first set of coordinates
    interval_tree={}
    coord_to_row_index={}
    for row_index,coord in enumerate(coords1):
        if coord.chr_id not in interval_tree:
            interval_tree[coord.chr_id]=Intersecter()
        interval_tree[coord.chr_id].add_interval(Interval(coord.bpstart,coord.bpend))
        coord_to_row_index[coord]=row_index

    # For each coordinate of the second set check intersection and fill the matrix
    coords_in_common=set()
    intersection_indexes=set()
    for cl_index,coord in enumerate(coords2):
        if coord.chr_id not in interval_tree:
            continue
        for coord_hit in interval_tree[coord.chr_id].find(coord.bpstart, coord.bpend):
            c_to_add=Coordinate.coordinates_from_interval(coord.chr_id, coord_hit)
            coords_in_common.add(c_to_add)
            hit_row=coord_to_row_index[c_to_add]
            intersection_indexes.add(hit_row)
            if build_matrix:
                intersection_matrix[hit_row,cl_index]+=1

    if not build_matrix:
        return list(coords_in_common),list(intersection_indexes)
    return list(coords_in_common),list(intersection_indexes),intersection_matrix
def annotate(self):
    """
    Fill self.annotation_track with one integer per input coordinate.

    The track starts at 1 everywhere. For every annotation BED file, the
    entries of the input coordinates returned by the class's
    __intersection_indexes helper are multiplied by the prime number
    assigned to that annotation name. Also (re)builds self.interval_tree,
    self.coord_to_row_index and self.row_index from self.input_coordinates.
    """
    #allocate_memory
    # np.int was only an alias of the builtin int and no longer exists in
    # NumPy >= 1.24; dtype=int selects the same dtype.
    self.annotation_track=np.ones(len(self.input_coordinates),dtype=int)
    self.interval_tree=dict()
    self.coord_to_row_index=dict()
    self.row_index=0

    #Build the interval Tree
    for c in self.input_coordinates:
        if c.chr_id not in self.interval_tree:
            self.interval_tree[c.chr_id]=Intersecter()
        self.interval_tree[c.chr_id].add_interval(Interval(c.bpstart,c.bpend))
        self.coord_to_row_index[c]=self.row_index
        self.row_index+=1

    for idx,bed_filename in enumerate(self.annotations_filenames):
        coordinates=Coordinate.bed_to_coordinates(bed_filename)
        prime_number=self.annotation_names_to_prime[self.annotation_names[idx]]
        for idx_intersection in self.__intersection_indexes(coordinates):
            self.annotation_track[idx_intersection]*=prime_number
def load_restriction_fragment(enzyme_bed):
    """
    Load the restriction fragments of an enzyme bed file into interval
    trees, one tree per chromosome.

    Lines are split on any whitespace; the first four fields are read as
    chromosome, start, end and name. Lines with fewer fields are reported
    and skipped.

    :param enzyme_bed: bed file path
    :return: res_frag, a defaultdict mapping chromosome to its Intersecter
    """
    res_frag = defaultdict(lambda :Intersecter())
    with open(enzyme_bed) as fp:
        nline = 0
        for line in fp:
            nline += 1
            fields = line.strip().split()
            if len(fields) < 4:
                print("[Warning] wrong input format in line :{}. "
                      "Not a bed file?!".format(nline))
                continue
            chrn, start, end, name = fields[:4]
            begin = int(start)
            finish = int(end)
            res_frag[chrn].add_interval(
                Interval(begin, finish, value={'name':name}))
    return res_frag
# fields three and six are set to default value '-' # field 8 is set to default value '.' to meet compatibility with GFF3 f1.columns = [ 'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes' ] f2 = pd.read_csv(sys.argv[2], header=None, sep="\t") f2.columns = [ 'chr', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes' ] tree_dict = {} tree = Intersecter() # first input file - gvf file containing the annotation for chrom in range(1, 23): tree_dict['chr' + str(chrom)] = Intersecter() tree_dict['chrX'] = Intersecter() tree_dict['chrY'] = Intersecter() gene_id_re = re.compile("(GeneID=[0-9]+)") gene_symbol = re.compile("(GeneSymbol=[A-z0-9]+)") for i, j in f1.iterrows(): geneid = gene_id_re.search(j[-1]).group(0) geneid_new = geneid.replace("GeneID=", "") genesymbol = gene_symbol.search(j[-1]).group(0)
def load_BED(in_file, exclusionSize=0, verbose=False): """ Read a BED file and store the intervals in a tree Intervals are zero-based objects. The output object is a hash table with one search tree per chromosome in_file = input file [character] verbose = verbose mode [logical] """ x = {} x_ex = {} if verbose: print "## Loading BED file '", in_file, "'..." nline = 0 with open(in_file) as bed_handle: for line in bed_handle: nline += 1 bedtab = line.split("\t") try: chromosome, start, end, name = bedtab[:4] except ValueError: print "Warning : wrong input format in line", nline, ". Not a BED file !?" continue # BED files are zero-based as Intervals objects start = int(start) # + 1 end = int(end) name = name.strip() if chromosome in x: tree = x[chromosome] tree.add_interval(Interval(start, end, value={'name': name})) else: tree = Intersecter() tree.add_interval(Interval(start, end, value={'name': name})) x[chromosome] = tree ## Exclusion regions if exclusionSize > 0: if chromosome in x_ex: tree_ex = x_ex[chromosome] tree_ex.add_interval( Interval(start - int(exclusionSize), start, value={'name': str(name) + "_up"})) tree_ex.add_interval( Interval(end, end + int(exclusionSize), value={'name': str(name) + "_dwn"})) else: tree_ex = Intersecter() tree_ex.add_interval( Interval(start - int(exclusionSize), start, value={'name': str(name) + "_up"})) tree_ex.add_interval( Interval(end, end + int(exclusionSize), value={'name': str(name) + "_dwn"})) x_ex[chromosome] = tree_ex bed_handle.close() return (x, x_ex)
mdict = {} for i in gffs: k = gffs.index(i) syn_region[k] = {} for y in i: if k == 0: if y.name.startswith('GRMZM'): y.name = y.name.split('.')[0] else: y.name = '_'.join(y.name.split('.')[:2]) if k == 1 or k == 2: y.name = '.'.join(y.name.split('.')[:2]) if y[2] != 'gene': continue if y.name not in syn[k]: continue if y[0] not in syn_region[k]: syn_region[k][y[0]] = Intersecter() syn_region[k][y[0]].add_interval(Interval(int(y[3]), int(y[4]), y.name)) if y.name not in mdict: mdict[y.name] = [] # if '_' in y.name:continue # if 'scaffold' in y.chrom:continue mdict[y.name].append(y.chrom) mdict[y.name].append(int(y[3])) mdict[y.name].append(int(y[4])) mdict[y.name].append(y.strand) cand = open(sys.argv[2]) count = 0 for line in cand: count += 1
chrom = options.region.split(':')[0] start,end = map(int,options.region.split(':')[1].replace(",","").split("-")) outchrom = chrom outchrom = outchrom.replace("chr","") if outchrom.startswith('gi|'): # gi|224589803|ref|NC_000012.11| NCfield = outchrom.split("|")[-2] if NCfield.startswith("NC"): outchrom = "%d"%(int(NCfield.split(".")[0].split("_")[-1])) if options.verbose: sys.stderr.write("Coordinates parsed: Chrom %s Start %d End %d\n"%(chrom,start,end)) except: sys.stderr.write("Error: Region not defined in a correct format!") sys.exit() posRange = defaultdict(lambda:[0,0,0]) filteredReads = Intersecter() for read in readIterator(args,options): if read.is_duplicate or read.is_qcfail or read.is_unmapped: continue if isSoftClipped(read.cigar): continue if read.is_paired: if read.mate_is_unmapped: continue if read.rnext != read.tid: continue if read.is_read1 or (not options.pipe and read.is_read2 and read.pnext+read.qlen < start): if read.isize == 0: continue if options.downsample != None and random.random() >= options.downsample: continue if options.random: rstart = min(read.pos,read.pnext)+1+random.randint(-5,5) # 1-based rend = rstart+abs(read.isize)-1+random.randint(-5,5) # end included else: rstart = min(read.pos,read.pnext)+1 # 1-based
trees = dict() with gzip.open(args.subject) if args.subject.endswith('.gz') else open( args.subject) as fh: genome = '' for line in fh: if line.isspace() or line[0] == '#': continue data = line.rstrip().split('\t') if len(data) != 9: sys.stderr.write('number of columns != 9: {}'.format(line)) g, start, end, strand = data[0], int(data[3]), int(data[4]), data[6] if g != genome: genome = g trees[genome] = Intersecter() if strand == '+': start -= args.extend_upstream end += args.extend_downstream else: start -= args.extend_downstream end += args.extend_upstream if not args.embeded and strand == '-': # complement strand start, end = -end, -start trees[genome].add_interval(Interval(start, end, value=data)) if args.split: if args.split_dir is None: outdir = '{}.intersect@{}'.format(
def read_chain_file(chain_file, print_table=False):
    '''
    Read a chain file into per-chromosome interval maps.

    Parameters
    ----------
    chain_file : file
        Chain format file. Input chain_file could be either plain text,
        compressed file (".gz", ".Z", ".z", ".bz", ".bz2", ".bzip2"), or a
        URL pointing to the chain file ("http://", "https://", "ftp://").
        If a URL is used, the chain file must be plain text.

    print_table : bool, optional
        Print every ungapped alignment block as a tab-separated row
        (source name, start, end, strand, target name, start, end, strand)
        after the whole file has been read.

    Returns
    -------
    maps : dict
        Dictionary with source chrom name as key, IntervalTree object as
        value. An IntervalTree contains many intervals. An interval is a
        start and end position and a value, here the tuple
        (target_name, target_start, target_end, target_strand).

    target_chromSize : dict
        Chromosome sizes of target genome

    source_chromSize : dict
        Chromosome sizes of source genome

    Raises
    ------
    Exception
        If a chain header has a source strand other than '+', a target
        strand other than '+'/'-', if a line is neither a chain header nor
        a 1- or 3-field alignment row, or if an alignment row appears
        before any chain header.
    '''
    logging.info("Read the chain file \"%s\" " % chain_file)

    maps = {}
    target_chromSize = {}
    source_chromSize = {}
    blocks = []  # only filled (and printed) when print_table is true
    header_seen = False

    for line in ireader.reader(chain_file):
        # Example: chain 4900 chrY 58368225 + 25985403 25985638 chr5 151006098 - 43257292 43257528 1
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        fields = line.split()
        n_fields = len(fields)

        if fields[0] == 'chain' and n_fields in (12, 13):
            source_name = fields[2]         # E.g. chrY
            source_size = int(fields[3])    # Full length of the chromosome
            source_strand = fields[4]       # Must be +
            if source_strand != '+':
                raise Exception(
                    "Source strand in a chain file must be +. (%s)" % line)
            source_start = int(fields[5])   # Start of source region

            target_name = fields[7]         # E.g. chr5
            target_size = int(fields[8])    # Full length of the chromosome
            target_strand = fields[9]       # + or -
            target_start = int(fields[10])
            target_chromSize[target_name] = target_size
            source_chromSize[source_name] = source_size

            if target_strand not in ['+', '-']:
                raise Exception("Target strand must be - or +. (%s)" % line)
            if source_name not in maps:
                maps[source_name] = Intersecter()

            # Running cursors: where the next ungapped block starts.
            sfrom, tfrom = source_start, target_start
            header_seen = True

        # Alignment rows: "size sgap tgap" for every block but the last,
        # and a bare "size" for the final block of the chain.
        elif fields[0] != 'chain' and n_fields in (1, 3):
            if not header_seen:
                # Without a header there is no chromosome/strand context;
                # fail with the module's own error instead of a NameError.
                raise Exception("Invalid chain format. (%s)" % line)

            size = int(fields[0])
            if n_fields == 3:
                sgap, tgap = int(fields[1]), int(fields[2])

            if target_strand == '+':
                tstart, tend = tfrom, tfrom + size
            else:
                # Chain coordinates on '-' are relative to the reverse
                # strand; flip them onto the forward strand.
                tstart = target_size - (tfrom + size)
                tend = target_size - tfrom

            if print_table:
                blocks.append(
                    (source_name, sfrom, sfrom + size, source_strand,
                     target_name, tstart, tend, target_strand))

            maps[source_name].add_interval(
                Interval(sfrom, sfrom + size,
                         (target_name, tstart, tend, target_strand)))

            if n_fields == 3:
                # Skip the block plus the gap that follows it on each side.
                sfrom += size + sgap
                tfrom += size + tgap

        else:
            raise Exception("Invalid chain format. (%s)" % line)

    if print_table:
        for i in blocks:
            print('\t'.join([str(n) for n in i]))

    return (maps, target_chromSize, source_chromSize)
def compare_two_transcripts(trans1, trans2, transcript_dict, afe=False):
    """
    Return the splice differences between two transcripts.

    Comparisons involving a single-exon transcript are ignored.

    Parameters
    ----------
    trans1 : string
        transcript of interest
    trans2 : string
        second transcript of interest
    transcript_dict : dict
        transcript names mapped to a list of exons. Each exon is indexed
        as (start, end, exon_number). The two exon lists looked up here
        are sorted IN PLACE by start position.
    afe : bool
        whether to include alternate start and ends.
        NOTE(review): this argument is never read in the body.

    :TODO make a better return
    :TODO maybe include something similar to to_plot

    Returns
    -------
    ([], []) when either transcript has one exon or fewer; otherwise a
    tuple (matching_exons, skipped_exons) where

    matching_exons : list of (start, end, (s1_exon_n, s2_exon_n), (0, 0))
        exons whose coordinates are identical in both transcripts
    skipped_exons : EventCollection
        built from the 'skipped_exon' DiffEvents, then extended with the
        alternate start ('AS') and alternate end ('AE') DiffEvents
    """
    # TODO refactor this
    t1 = transcript_dict[trans1]
    t2 = transcript_dict[trans2]
    tree = Intersecter()
    starts1 = [i[0] for i in t1]
    starts2 = [i[0] for i in t2]
    # s1 is the transcript whose first exon starts first (ties go to
    # trans1); it is loaded into the interval tree and s2 is walked
    # against it.
    reverse = False
    if min(starts1) <= min(starts2):
        s1 = t1
        s2 = t2
        s2_beg = min(starts2)  # NOTE(review): s2_beg is never used below
    else:
        s1 = t2
        s2 = t1
        reverse = True
        s2_beg = min(starts1)
    # torder records the transcript names in (s1, s2) order.
    if reverse:
        torder = (trans2, trans1)
    else:
        torder = (trans1, trans2)
    # Ignore single-exon stuff
    if len(s1) <= 1 or len(s2) <= 1:
        return([], [])
    for i in s1:
        tree.add_interval(Interval(int(i[0]), int(i[1]),
                                   value={'anno':i[2]}))
    matching_exons = []
    # NOTE(review): exclusive_juncs is filled below but never returned.
    exclusive_juncs = []
    skipped_exons = []
    altends = []
    # Maps an s2 exon number to the s1 exon number it overlapped.
    exon_match = {}
    # In-place sort: this reorders the lists stored in transcript_dict.
    s1.sort(key=lambda x: x[0])
    s2.sort(key=lambda x: x[0])
    max_exon_1 = s1[-1][2]
    max_exon_2 = s2[-1][2]
    #end_position_s2 = max([i[1] for i in s2])
    s1_end = max([i[1] for i in s1])
    prev_match = None
    # Exon numbers decreasing along the coordinate axis is treated as the
    # minus strand; strand is then used as the exon-number step.
    if max_exon_1 < s1[0][2]:
        strand = -1
    else:
        strand = 1
    for pcurr in range(len(s2)):
        start, end, exon_n = s2[pcurr]
        overlap = tree.find(int(start), int(end))
        if len(overlap) == 0:
            # This s2 exon touches no s1 exon.
            if prev_match and (start < s1_end):
                #skipped exons
                # presumably _generate_cigar returns a list of
                # (op, length) tuples - TODO confirm against the helper
                cigar = _generate_cigar(s2, pcurr, mskip=1)
                try:
                    if exon_match[exon_n - strand] == prev_match.value['anno']:
                        try:
                            # Next s2 exon that lands on s1 closes the skip.
                            nm = tree.find(*s2[pcurr + 1][0:2])[0]
                            ocigar = [(3, nm.start - prev_match.end)]
                            nexon = nm.value['anno']
                        except IndexError:
                            # No following match: fall back to the s1 exon
                            # numbered one step after the previous match.
                            nm=s1[_get_by_exonn(prev_match.value['anno']+strand,s1)]
                            ocigar = [(3,nm[0] - prev_match.end)]
                            nexon = nm[2]
                        skipped_exons.append(DiffEvent('skipped_exon', start, end,
                            torder, cigar2=cigar, cigar1 = ocigar,
                            exon_num = (None, exon_n),
                            exon2=(prev_match.value['anno'], nexon)) )
                except KeyError:
                    # Multiple skipped exons
                    # The previous s2 exon had no match either, so the
                    # most recent event is extended instead of adding one.
                    ncig = _generate_cigar(s2, pcurr, mskip=1)[1:]
                    skipped_exons[-1]._extend(ncig, cig=2)
            elif start > s1_end:
                # s2 exon lies entirely past the end of s1.
                if prev_match:
                    cigar = _generate_cigar(s2, pcurr, mskip=1)
                    pm = tree.find(*s2[pcurr - 1][0:2])[0]
                    pexon = pm.value['anno']
                    ocigar = []
                    # Walk the s1 exons from the previous match to the
                    # last one, recording each exon and the gap after it.
                    for i in range(pexon, max_exon_1+strand, strand):
                        narg = _get_by_exonn(i, s1)
                        ocigar.append((0, s1[narg][0], s1[narg][1]))
                        try:
                            ocigar.append((3, s1[narg][1] - s1[narg+1][0]))
                        except IndexError:
                            pass
                    #:TODO extend ocigar till end?
                    altends.append(DiffEvent('AE', start, end, torder,
                        cigar2=cigar, cigar1=ocigar,
                        exon_num = (None, exon_n)))
                else:
                    pass
            else:
                # Alternate start site that starts in between exons
                # of other transcript
                cigar = _generate_cigar(s2, pcurr, mskip=1)[:-1]
                try:
                    nm = tree.find(*s2[pcurr + 1][0:2])[0]
                except IndexError:
                    # NOTE(review): leftover debugging hook. It opens an
                    # interactive shell; afterwards nm is unbound or stale
                    # from an earlier iteration. Replace with real handling.
                    from IPython import embed
                    embed()
                nexon = nm.value['anno']
                narg = _get_by_exonn(nexon - strand, s1)
                pmatch = s1[narg]
                ocigar = [(0, pmatch[1] - pmatch[0]),
                          (3, nm.start - pmatch[1])]
                altends.append(DiffEvent('AS', start, end, torder,
                    cigar2=cigar, cigar1 = ocigar))
        elif len(overlap) == 1:
            if start == overlap[0].start and end == overlap[0].end:
                # Exact coordinate match between the two transcripts.
                s1_exon_n = overlap[0].value['anno']
                matching_exons.append((start, end, (s1_exon_n, exon_n),
                                       (0, 0)))
                if prev_match:
                    if s1_exon_n - prev_match.value['anno'] == strand:
                        pass
                    else:
                        # Difference in exon matches
                        # s1 has exons between the two matches that s2
                        # lacks; mskip is how many.
                        mskip = abs(s1_exon_n - prev_match.value['anno'] ) - 1
                        narg = _get_by_exonn(prev_match.value['anno']+strand, s1)
                        s_s1 = s1[narg]  # skipped s1
                        cigar = _generate_cigar(s1, narg, mskip=mskip)
                        ocigar = [(3, start - s2[pcurr-1][1])]
                        # Remove previous one
                        skipped_exons.append(
                            DiffEvent('skipped_exon', s_s1[0], s_s1[1],
                                torder, cigar2 = ocigar, cigar1 = cigar,
                                exon_num = (s_s1[2], None),
                                exon2 = (exon_n-strand, exon_n)))
                prev_match = overlap[0]
            else:
                # Partial overlap: span the union of both exons.
                sstart = min(start, overlap[0].start)
                ssend = max(end, overlap[0].end)
                # Ignore 5' or 3' differences
                # NOTE(review): which `if` this `else` pairs with is not
                # recoverable from the flattened source; the returned
                # values are the same either way - confirm upstream.
                if (exon_n == max_exon_2 and
                        overlap[0].value['anno'] == max_exon_1):
                    if end == overlap[0].end:
                        prev_match = overlap[0]
                else:
                    exclusive_juncs.append(
                        (sstart, ssend,
                         (overlap[0].value['anno'], exon_n),
                         (overlap[0].start - start, overlap[0].end - end)
                         ))
                # Deal with partial matches
                prev_match = overlap[0]
            exon_match[exon_n] = int(overlap[0].value['anno'])
        else:
            # An s2 exon overlapping several s1 exons is not handled.
            pass
    # NOTE(review): transcript_ids receives the exon lists s1/s2, not the
    # transcript names - confirm this is what EventCollection expects.
    skipped_exons = EventCollection(transcript_ids = [s1, s2],
                                    events=skipped_exons)
    skipped_exons.events.extend(altends)
    return(matching_exons, skipped_exons)
## Choose the output destination, then initialise the per-run counters.
if options.pipe:  ## if -p specified, pass output along the pipe (stdout) without writing a file
    outfile = sys.stdout
elif options.outdir != None:  ## otherwise, write to a file under the requested directory
    ## path is resolved relative to the current working directory
    outfilepath = os.path.join(os.getcwd(), options.outdir + "/" + outfilename)
    directory = os.path.dirname(outfilepath)  ## get the directory full path
    if not os.path.exists(directory):  ## check if it exists
        os.makedirs(directory)  ## if not, create the directory
    outfile = open(outfilepath, "w+")  ## open (and truncate) the file in that directory
else:
    outfile = open(outfilename, "w+")  ## no outdir given: write to current dir
""" Handle data """
total_reads = 0  ## number of reads seen in infile
start_end_list = Intersecter( )  ## interval index (an Intersecter, not a plain list) for the start/end of each read in infile
init_pos = 0  ## presumably the region start used when the user does not supply one - TODO confirm
final_pos = 0  ## presumably the region end used when the user does not supply one - TODO confirm
read_start = 0  ## start of each read
read_end = 0  ## end of each read
other_read_end = 0
""" Main functionality: """
for read in infile:
    total_reads += 1
    ## get start position of the first read
def generate(x):
    """Return a random (lo, hi) interval.

    ``lo`` is drawn from [10000, SIZE]; the length is at least 1 and at
    most 10**4, with the upper bound itself drawn at random so short
    intervals are favoured. ``x`` is ignored; it only exists so the
    function can be mapped over a range.
    """
    lo = randint(10000, SIZE)
    hi = lo + randint(1, randint(1, 10**4))
    return (lo, hi)


def generate_point(x):
    """Return a random zero-length interval (lo, lo); ``x`` is ignored."""
    lo = randint(10000, SIZE)
    return (lo, lo)


# use this to force both examples to generate the same data
seed(10)

# generate N random intervals; built eagerly so the random draws happen
# in the same order on Python 2 and Python 3
data = [generate(i) for i in range(N)]

# generate the 1000 points to query over
query = [generate_point(i) for i in range(1000)]

# create the interval tree
tree = Intersecter()

# build an interval tree from the generated intervals
for start, end in data:
    tree.add_interval(Interval(start, end))

# perform the query; each query is (q, q), so only one coordinate is needed
for q, _ in query:
    overlap = tree.find(q, q)
    out = [(x.start, x.end) for x in overlap]
    # parenthesised single-argument print works on Python 2 and 3 alike
    print('(%s) -> %s' % (q, out))