def exon_matching(
    exon_tree: IntervalTree,
    ref_exon: Interval,
    match_extend_tolerate_left: int,
    match_extend_tolerate_right: int,
    intervals_adjacent: bool = True,
) -> List[IntervalTree]:
    """
    Find a run of nodes in exon_tree whose outer boundaries match ref_exon.

    exon_tree --- interval tree queried with find(start, end); per the
                  original notes it is built from .baseC/.altC exon
                  detection (probably short read data)
    ref_exon --- interval with .start/.end to be matched (probably PacBio)
    match_extend_tolerate_left --- largest accepted |start difference|
    match_extend_tolerate_right --- largest accepted |end difference|
    intervals_adjacent --- when True, every overlapping node must be
                  adjacent to the next (one integral exon, no splicing)

    Returns the slice of overlapping nodes from the first node whose start
    is within tolerance to the furthest node whose end is within tolerance.
    Returns None when nothing overlaps, when adjacency is required but not
    satisfied, or when no start/end pair is within tolerance.
    """
    hits = exon_tree.find(ref_exon.start, ref_exon.end)

    # Nothing overlaps: likely due to very low coverage on the transcript.
    if len(hits) == 0:
        return None

    # The hits must form one integral exon (no splicing) unless the caller
    # explicitly waived the adjacency requirement.
    if intervals_adjacent and not c_branch.intervals_all_adjacent(hits):
        return None

    last_index = len(hits) - 1
    for left, candidate in enumerate(hits):
        # Skip nodes whose start is too far from the reference start.
        if abs(candidate.start - ref_exon.start) > match_extend_tolerate_left:
            continue
        # Walk back from the far end so the furthest acceptable end wins.
        right = last_index
        while right >= left:
            if abs(hits[right].end - ref_exon.end) <= match_extend_tolerate_right:
                return hits[left:right + 1]
            right -= 1

    # No start/end combination was within tolerance.
    return None
def test_empty(self):
    """Every query on a freshly created tree yields nothing."""
    tree = IntervalTree()

    # Range query on an empty tree.
    self.assertEqual([], tree.find(100, 300))

    # Single-position queries, checked in the same order as before.
    for query in (
        tree.after,
        tree.before,
        tree.after_interval,
        tree.before_interval,
        tree.upstream_of_interval,
        tree.downstream_of_interval,
    ):
        self.assertEqual([], query(100))

    # traverse is expected to return None on an empty tree.
    self.assertEqual(None, tree.traverse(lambda x: x.append(1)))
def test_empty(self):
    """Every query on a freshly created tree yields nothing."""
    tree = IntervalTree()

    # Range query on an empty tree.
    self.assertEqual([], tree.find(100, 300))

    # Single-position queries, checked in the same order as before.
    for query in (
        tree.after,
        tree.before,
        tree.after_interval,
        tree.before_interval,
        tree.upstream_of_interval,
        tree.downstream_of_interval,
    ):
        self.assertEqual([], query(100))

    # traverse is expected to return None on an empty tree.
    self.assertEqual(None, tree.traverse(lambda x: x.append(1)))
def plot_coverage(coords, bams):
    '''Plot read alignment coverage for each coordinate and BAM file.

    Given the name of a DNA coordinates file and a list of BAM file
    names, plot the read alignment coverage for each BAM file for each
    coordinate. One graph per coordinate is generated; the coverage of
    every BAM file for a given coordinate is drawn on the same graph.

    coords -- name of the coordinates file (TSV format); it is read via
              get_coords and iterated as (chrom, start, end) triples,
              with start/end treated as 1-based inclusive positions.
    bams   -- iterable of BAM file names.

    Returns None; the output is produced through start_graph,
    plot_graph and end_graph.
    '''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end + 1)
        for bam_filename in bams:
            # The interval tree tracks the first and last mapped
            # coordinate of each read in the BAM file that lies within
            # our region of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # fetch takes a 0-based, half-open [start, stop) region.
                # Our 1-based inclusive region [start, end] is therefore
                # [start - 1, end); passing end - 1 as the stop would
                # leave the last base of the region out of the query.
                reads = bam.fetch(chrom, start - 1, end)
                for read in reads:
                    # Read the property once; it is used three times.
                    positions = read.positions
                    if len(positions) > 0:
                        # Add 1 to convert from 0-based to 1-based
                        first_pos = positions[0] + 1
                        last_pos = positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest, count
            # the reads whose span overlaps it; this is the coverage.
            # NOTE(review): whether find(pos, pos) counts a read that
            # starts or ends exactly at pos depends on the tree's
            # boundary semantics -- confirm against the library.
            counts = [
                len(interval_tree.find(pos, pos)) for pos in coords_range
            ]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
def plot_coverage(coords, bams):
    '''Plot read alignment coverage for each coordinate and BAM file.

    Given the name of a DNA coordinates file and a list of BAM file
    names, plot the read alignment coverage for each BAM file for each
    coordinate. One graph per coordinate is generated; the coverage of
    every BAM file for a given coordinate is drawn on the same graph.

    coords -- name of the coordinates file (TSV format); it is read via
              get_coords and iterated as (chrom, start, end) triples,
              with start/end treated as 1-based inclusive positions.
    bams   -- iterable of BAM file names.

    Returns None; the output is produced through start_graph,
    plot_graph and end_graph.
    '''
    coords = get_coords(coords)
    for chrom, start, end in coords:
        logging.info("processing coord {} {} {}".format(chrom, start, end))
        # Start plotting the graph and generate a name for the output file
        graph_filename = start_graph(chrom, start, end)
        coords_range = range(start, end + 1)
        for bam_filename in bams:
            # The interval tree tracks the first and last mapped
            # coordinate of each read in the BAM file that lies within
            # our region of interest.
            interval_tree = IntervalTree()
            with pysam.Samfile(bam_filename, "rb") as bam:
                logging.info("processing bam file {}".format(bam_filename))
                # fetch takes a 0-based, half-open [start, stop) region.
                # Our 1-based inclusive region [start, end] is therefore
                # [start - 1, end); passing end - 1 as the stop would
                # leave the last base of the region out of the query.
                reads = bam.fetch(chrom, start - 1, end)
                for read in reads:
                    # Read the property once; it is used three times.
                    positions = read.positions
                    if len(positions) > 0:
                        # Add 1 to convert from 0-based to 1-based
                        first_pos = positions[0] + 1
                        last_pos = positions[-1] + 1
                        interval_tree.add(first_pos, last_pos, None)
            # For each base position in our region of interest, count
            # the reads whose span overlaps it; this is the coverage.
            # NOTE(review): whether find(pos, pos) counts a read that
            # starts or ends exactly at pos depends on the tree's
            # boundary semantics -- confirm against the library.
            counts = [
                len(interval_tree.find(pos, pos)) for pos in coords_range
            ]
            # Plot the coverage information for this bam file
            legend_text = bam_name_legend(bam_filename)
            plot_graph(counts, coords_range, legend_text)
        # Close the drawing of the graph for this set of coordinates
        end_graph(graph_filename)
def parse_blast(blast_str, qfeat):
    """Return the BLAST hit intervals that intersect the query feature.

    blast_str -- tab-separated BLAST output as a single string, one hit
                 per line. Lines containing "WARNING" or "ERROR" and blank
                 lines are skipped.
    qfeat     -- mapping with 'start' and 'end' entries for the query CNS.

    Each hit contributes one interval taken from columns 6 and 7
    (0-based) of its line, normalised so that start <= end. The return
    value is the list produced by IntervalTree.find for the normalised
    query range.

    Raises ValueError if a hit line has fewer than eight columns or the
    two coordinate columns are not integers.
    """
    scns_inteval = IntervalTree()
    for line in blast_str.split("\n"):
        # A trailing newline yields an empty final line; it is not a hit.
        if not line.strip():
            continue
        if "WARNING" in line or "ERROR" in line:
            continue
        fields = line.split("\t")
        # Only these two columns are used, so only they are parsed.
        hit_start, hit_end = map(int, fields[6:8])
        # Take min and max from the untouched pair. Overwriting the start
        # first and then taking max(start, end) collapsed reversed hits
        # (start > end) into a zero-width interval.
        s_start = min(hit_start, hit_end)
        s_end = max(hit_start, hit_end)
        scns_inteval.insert_interval(Interval(s_start, s_end))
    q_lo = int(qfeat['start'])
    q_hi = int(qfeat['end'])
    q_start = min(q_lo, q_hi)
    q_end = max(q_lo, q_hi)
    intersecting_cns = scns_inteval.find(q_start, q_end)
    return intersecting_cns
class IntervalTreeOverlapDetector(OverlapDetector):
    """Overlap detector backed by a bx-python IntervalTree."""

    def __init__(self, excludedSegments=None):
        """Create the detector, optionally pre-loaded with segments.

        excludedSegments -- optional iterable of (start, end) pairs.
        """
        from bx.intervals.intersection import IntervalTree
        self._intervalTree = IntervalTree()
        if excludedSegments:
            for start, end in excludedSegments:
                # Go through the same retrying insert as addSegment so the
                # workaround for the sporadic bx-python failure also covers
                # segments supplied at construction time.
                self._addElementHandleBxPythonZeroDivisionException(start, end)

    def overlaps(self, start, end):
        """Return True if the tree's find(start, end) returns any segment."""
        return bool(self._intervalTree.find(start, end))

    def addSegment(self, start, end):
        """Add the segment (start, end) to the tree, retrying on failure."""
        self._addElementHandleBxPythonZeroDivisionException(start, end)

    def _addElementHandleBxPythonZeroDivisionException(self, start, end, nrTries=10):
        """
        DivisionByZero error is caused by a bug in the bx-python library.
        It happens rarely, so we just execute the add command again when it
        does. Every failure is logged as a warning. Once the number of
        failed attempts exceeds nrTries (i.e. on failure nrTries + 1), we
        assume something else is wrong and re-raise the last exception.
        """
        cnt = 0
        while True:
            cnt += 1
            try:
                self._intervalTree.add(start, end)
            except Exception as e:
                from gold.application.LogSetup import logMessage, logging
                logMessage("Try nr %i. %s" % (cnt, str(e)), level=logging.WARN)
                if cnt > nrTries:
                    # Bare raise keeps the original traceback intact.
                    raise
            else:
                return
def resolve_conflicts(pfam_hit_dict,minDomSize = 9,verbose=False):
    '''
    Resolve overlapping Pfam hits for one gene into non-overlapping windows,
    then merge neighbouring windows of the same family and drop short ones.

    Hits are visited from highest to lowest score. A hit that overlaps an
    already accepted window is trimmed, skipped (when fully contained), or
    split in two (when it fully covers the accepted window).

    NOTE: pfam_hit_dict is modified in place -- when a hit is split, its
    entry is deleted and two new entries are inserted.

    :param pfam_hit_dict: dictionary of hits for the gene in the following format
    hit start,hit end : int
    hit id : str
    score, model coverage percent : float
    {(hit start,hit end):('hit id',score,model coverage percent)}
    :param minDomSize: int, the minimum window size that will be considered a domain
    :param verbose: bool, print progress messages while resolving
    :return: a sorted dictionary with the position of the hit as the keys and ('hit id',score,model coverage percent)
    '''
    # initialize output
    # NOTE(review): gene_hits is created once, outside the restart loop below,
    # so entries added during a pass that is later restarted are kept unless
    # they are overwritten -- confirm this is intended.
    gene_hits = SortedDict()
    redoFlag = True
    # Each iteration is one full pass over the hits; a split sets redoFlag and
    # forces another pass over the updated pfam_hit_dict.
    while redoFlag:
        if verbose: print("Sorting through intervals", pfam_hit_dict)
        redoFlag = False
        intervals_scores = [(key,value[1]) for key,value in pfam_hit_dict.items()]
        # sort intervals from pfam hits by score and place the highest score first
        intervals_scores.sort(key=itemgetter(1),reverse=True)
        # initialize intersect tree for quick overlap search
        # (rebuilt from scratch on every pass)
        intersectTree = IntervalTree()
        # add the intervals with the highest scores first
        for (interval,score) in intervals_scores:
            intervalStart = interval[0]
            intervalEnd = interval[1]
            intervalLength = intervalEnd-intervalStart+1
            # if the interval is not longer than the minimum domain size don't bother
            if intervalLength > minDomSize:
                intersectingIntervals = [(x.start,x.end) for x in intersectTree.find(intervalStart,intervalEnd)]
                overLapFlag = False
                # for every interval that you're adding resolve the overlapping intervals
                # (only the first reported overlap is handled per iteration;
                # the overlap list is recomputed after each trim)
                while len(intersectingIntervals) > 0 and intervalLength > 1:
                    start,end = intersectingIntervals[0]
                    # interval completely covers existing coverage, break up into two intervals and redo the process
                    if (intervalStart < start and intervalEnd > end):
                        if verbose: print("Split Interval", interval,intersectingIntervals, pfam_hit_dict[interval])
                        # presumably calculate_window returns the length of a
                        # (start, end) window; verify against its definition
                        left_scale = calculate_window((intervalStart,start-1))/intervalLength
                        right_scale = calculate_window((end+1,intervalEnd))/intervalLength
                        # both pieces keep the hit id and score; only the
                        # coverage value is scaled down
                        pfam_hit_dict[(intervalStart,start-1)] = (pfam_hit_dict[interval][0],
                                                                  pfam_hit_dict[interval][1],
                                                                  pfam_hit_dict[interval][2] * left_scale)
                        pfam_hit_dict[(end+1,intervalEnd)] = (pfam_hit_dict[interval][0],
                                                              pfam_hit_dict[interval][1],
                                                              pfam_hit_dict[interval][2] * right_scale)
                        # delete original hit and iterate
                        del pfam_hit_dict[interval]
                        redoFlag = True
                        break
                    else:
                        # completely in the interval
                        if (intervalStart >= start and intervalEnd <= end):
                            # if completely overlapping then ignore since we already sorted by score
                            overLapFlag = True
                            break
                        # intersection covers the left hand side of the interval
                        elif intervalStart >= start:
                            intervalStart = end + 1
                        # intersection covers the right hand side of the interval
                        elif intervalEnd <= end:
                            intervalEnd = start - 1
                        # recalculate the interval length and see if there are still intersecting intervals
                        intervalLength = intervalEnd-intervalStart+1
                        intersectingIntervals = [(x.start,x.end) for x in intersectTree.find(intervalStart,intervalEnd)]
                if redoFlag:
                    # a split happened: leave the for loop so the outer while
                    # starts a new pass
                    if verbose: print("Exiting For Loop to Reinitialize",pfam_hit_dict)
                    break
                # if loop did not break because of an overlap add the annotation after resolving overlap,
                # check for minimum length after you merge intervals
                elif not overLapFlag and intervalLength > minDomSize:
                    if verbose: print("Adding Hit",(intervalStart,intervalEnd),pfam_hit_dict[interval][0])
                    # scale the hitCoverage based on the reduction this works since interval is a tuple and isn't mutated
                    hitCoverage = pfam_hit_dict[interval][2]*(intervalLength/(interval[1]-interval[0]+1.))
                    gene_hits[(intervalStart,intervalEnd)] = (pfam_hit_dict[interval][0],
                                                              pfam_hit_dict[interval][1],
                                                              hitCoverage)
                    intersectTree.add_interval(Interval(float(intervalStart),intervalEnd))
    if verbose: print("Merging Hits")
    # Merge Windows Right Next to one another that have the same pFam ID,
    # redoFlag: need to restart the process after a successful merge
    redoFlag = True
    while redoFlag:
        # compare each window with its immediate successor in sorted key order
        for idx in range(len(gene_hits)-1):
            left_hit = gene_hits.keys()[idx]
            right_hit = gene_hits.keys()[idx+1]
            left_window_size = calculate_window(left_hit)
            right_window_size = calculate_window(right_hit)
            merged_window_size = calculate_window((left_hit[0],right_hit[1]))
            new_coverage = (gene_hits[left_hit][2] + gene_hits[right_hit][2])*\
                           (left_window_size+ right_window_size)/merged_window_size
            # Will merge a hit under the following conditions:
            # 1. Gap between the two hits is less than the minimum domain
            # 2. Both hits carry the same hit id
            # 3. Cumulative coverage of the two hits is less than 1 (this avoids merging repeats together)
            if right_hit[0]-left_hit[1] < minDomSize and gene_hits[left_hit][0] == gene_hits[right_hit][0] \
                    and new_coverage < 1:
                # merged score is the window-size-weighted sum of both scores
                gene_hits[(left_hit[0],right_hit[1])] = (gene_hits[left_hit][0],
                                                         left_window_size/merged_window_size * gene_hits[left_hit][1] +
                                                         right_window_size/merged_window_size * gene_hits[right_hit][1],
                                                         new_coverage)
                redoFlag = True
                del gene_hits[left_hit]
                del gene_hits[right_hit]
                if verbose: print("Merged", left_hit,right_hit)
                # keys changed, so rescan from the beginning
                break
        else:
            # the scan finished without a merge (or there were fewer than two
            # windows): nothing left to merge
            redoFlag = False
    if verbose: print("Deleting Domains Under Minimum Domain Size")
    # Finally check if any of the domains are less than the minimum domain size
    keysToDelete = [coordinates for coordinates in gene_hits.keys() if calculate_window(coordinates) < minDomSize]
    for key in keysToDelete:
        del gene_hits[key]
        if verbose: print("Deleting",key)
    if verbose: print("Final Annotation", gene_hits)
    return gene_hits
def annotate_igrs(genome, igr_df):
    """
    Annotate the inter-genic regions listed in a dataframe with any available annotations from Rfam

    Parameters
    ----------
    genome: src.data.rfam_db.Genome
        The genome object for the organism whose IGRs are being analyzed
    igr_df: pandas.Dataframe
        The dataframe with the columns 'accession', 'start', 'end', 'length', 'gc'.
        It is also merged on an 'igr_index' column, which must be present.

    Returns
    -------
    annotated_igr_df: pandas.Dataframe
        One row per IGR with the overlapping Rfam accessions, ids, types and
        descriptions joined into single strings, plus the 'category' and
        'log_length' columns.
    """
    # Initialize connection to Rfam database
    session = rfam_session()
    try:
        # Get the list of "rfamseq_acc" numbers for a given organism
        rfamseq_acc_list = session.query(t_genseq.c.rfamseq_acc).filter(
            t_genseq.c.upid == genome.upid).distinct().all()

        # One interval tree of RNA annotations per (version-less) accession
        annotation_tree_dict = {}
        for rfamseq_acc_row in rfamseq_acc_list:
            # Each result row holds the accession as its only element
            rfamseq_acc = rfamseq_acc_row[0]
            rna_list = session.query(t_full_region).filter(
                t_full_region.c.rfamseq_acc == rfamseq_acc).all()

            # Interval tree of all RNA annotations for rapid overlap search
            annotation_tree = IntervalTree()
            for rna in rna_list:
                # seq_start may be larger than seq_end; normalise the order
                start = min(rna.seq_start, rna.seq_end)
                end = max(rna.seq_start, rna.seq_end)
                annotation_tree.insert_interval(
                    Interval(start=start, end=end,
                             chrom=rna.rfamseq_acc, value=rna))

            # Key the tree by the accession without its ".version" suffix
            rfamseq_acc_stripped = rfamseq_acc.partition('.')[0]
            annotation_tree_dict[rfamseq_acc_stripped] = annotation_tree

        # Collect one record per (IGR, overlapping RNA annotation) pair
        annotated_igr_list = []
        for accession, accession_igr_df in igr_df.groupby('accession'):
            # Lookup the RNA annotation tree for the given accession
            try:
                annotation_tree = annotation_tree_dict[accession]
            except KeyError:
                print("IGR dataframe key: {} not found. Available keys are: {}".
                      format(accession, annotation_tree_dict.keys()))
                # Without a tree for this accession there is nothing to
                # match against. Falling through would reuse the tree left
                # over from another accession and attach wrong annotations.
                continue

            # For each IGR find all of the overlaps with annotated RNAs
            for igr in accession_igr_df.itertuples():
                for overlap in annotation_tree.find(igr.start, igr.end):
                    annotated_igr_list.append({
                        'igr_index': igr[0],
                        'rfam_acc': overlap.value.rfam_acc
                    })

        # Convert annotated_igr_list into dataframe and merge on the igr_index
        annotated_igr_df = pd.merge(
            igr_df,
            pd.DataFrame(annotated_igr_list, columns=["igr_index", "rfam_acc"]),
            on="igr_index", how='left')

        # Look up the information for all of the RNA families represented in this genome
        rna_family_query = session.query(Family)\
            .with_entities(Family.rfam_acc, Family.rfam_id, Family.description, Family.type)\
            .filter(Family.rfam_acc.in_(annotated_igr_df["rfam_acc"].dropna().unique()))
        rna_families_df = pd.read_sql(rna_family_query.statement,
                                      rna_family_query.session.bind)

        merged_igr_df = pd.merge(annotated_igr_df, rna_families_df,
                                 on="rfam_acc", how="left")

        # Collapse multiple annotations of one IGR into joined strings
        combined_descriptions = merged_igr_df.dropna().groupby("igr_index")\
            .agg(dict(rfam_acc=lambda x: ','.join(set(x)),
                      rfam_id=lambda x: ','.join(set(x)),
                      type=lambda x: ','.join(set(x)),
                      description=lambda x: '<br>'.join(set(x))))

        # Keep one row per IGR, then overwrite it with the combined values
        merged_igr_df.drop_duplicates(["igr_index"], inplace=True)
        merged_igr_df.reset_index(inplace=True, drop=True)
        merged_igr_df.update(combined_descriptions)

        merged_igr_df["category"] = merged_igr_df.apply(categorize_igr, axis=1)
        merged_igr_df["log_length"] = np.log(merged_igr_df["length"])
        return merged_igr_df
    finally:
        # Release the database session even if a query or merge fails
        session.close()
def _bx(es):
    """Insert every entry of es into a fresh IntervalTree and query it back.

    Each entry is indexed as entry[0] (start) and entry[1] (end) and stored
    as its own payload. The hit count of the query is computed and then
    discarded; nothing is returned.
    """
    tree = IntervalTree()
    for entry in es:
        lo, hi = entry[0], entry[1]
        tree.add(lo, hi, entry)
        hit_count = len(tree.find(lo, hi))
def _bx(es):
    """Insert every entry of es into a fresh IntervalTree and query it back.

    Each entry is indexed as entry[0] (start) and entry[1] (end) and stored
    as its own payload. The hit count of the query is computed and then
    discarded; nothing is returned.
    """
    tree = IntervalTree()
    for entry in es:
        lo, hi = entry[0], entry[1]
        tree.add(lo, hi, entry)
        hit_count = len(tree.find(lo, hi))