def setUp(self):
    """Build the fixture tree: eleven 10-wide intervals starting at 0, 10, ..., 100.

    The (50, 59) interval is inserted first; the remaining ones follow in
    ascending order of their start coordinate.
    """
    tree = IntervalTree()

    # The middle interval goes in before everything else.
    tree.add_interval(Interval(50, 59))

    # Every other start position, skipping the one that is already in the tree.
    remaining_starts = [start for start in range(0, 110, 10) if start != 50]
    for start in remaining_starts:
        tree.add_interval(Interval(start, start + 9))

    self.intervals = tree
def setUp(self):
    """Populate a tree through all four insertion entry points.

    For each base position four intervals are added (offsets 0, 20, 40 and 60,
    each 10 wide), and the total number of inserted intervals is stored in
    ``self.nintervals``. The tree is exposed as both ``self.intervals`` and
    ``self.iv``.
    """
    tree = IntervalTree()
    count = 0
    for base in range(1, 1000, 80):
        square = base * base
        # start / end / value passed positionally
        tree.insert(base, base + 10, {"value": square})
        # add is a synonym for insert.
        tree.add(base + 20, base + 30, {"astr": str(square)})
        # or insert/add an interval object with start, end attrs.
        tree.insert_interval(
            Interval(base + 40, base + 50, value={"astr": str(square)}))
        tree.add_interval(
            Interval(base + 60, base + 70, value={"astr": str(square)}))
        count += 4

    self.intervals = tree
    self.iv = tree
    self.nintervals = count
def setUp(self):
    """Create the shared tree fixture and remember how many intervals it holds.

    Four intervals are inserted per base position, one through each of
    ``insert``, ``add``, ``insert_interval`` and ``add_interval``.
    """
    tree = IntervalTree()
    bases = range(1, 1000, 80)

    for base in bases:
        squared = base * base
        squared_text = {"astr": str(squared)}

        tree.insert(base, base + 10, {"value": squared})
        # add is a synonym for insert.
        tree.add(base + 20, base + 30, dict(squared_text))
        # or insert/add an interval object with start, end attrs.
        tree.insert_interval(
            Interval(base + 40, base + 50, value=dict(squared_text)))
        tree.add_interval(
            Interval(base + 60, base + 70, value=dict(squared_text)))

    self.intervals = tree
    self.iv = tree
    # Four insertions happen for every base position.
    self.nintervals = 4 * len(bases)
def resolve_conflicts(pfam_hit_dict, minDomSize=9, verbose=False):
    '''
    Resolve overlapping Pfam hits on one gene into a non-overlapping annotation.

    The work happens in three phases:

    1. Placement. Hits are processed from the highest score to the lowest. A hit
       that overlaps hits placed before it is trimmed to its uncovered part,
       dropped when it lies entirely inside a placed hit, or split into its two
       flanking pieces when it spans a placed hit. After a split the whole
       placement pass is restarted with the updated hit dictionary.
    2. Merging. Adjacent hits are merged when they carry the same hit id, the gap
       between them is smaller than minDomSize and the combined coverage stays
       below 1 (this avoids merging repeats together).
    3. Clean-up. Hits whose window is smaller than minDomSize are removed.

    NOTE: pfam_hit_dict is modified in place. A hit that has to be split is
    deleted from it and replaced by its two flanking pieces.

    :param pfam_hit_dict: dictionary of hits for the gene in the following format
                          hit start,hit end : int
                          hit id : str
                          score, model coverage percent : float
                          {(hit start,hit end):('hit id',score,model coverage percent)}
    :param minDomSize: int, the minimum window size that will be considered a domain
    :param verbose: bool, print progress messages while resolving
    :return: a sorted dictionary with the position of the hit as the keys and
             ('hit id',score,model coverage percent) as the values
    '''

    def _overlapping(tree, lo, hi):
        # (start, end) of every placed hit the tree reports for the query range.
        return [(x.start, x.end) for x in tree.find(lo, hi)]

    # initialize output
    gene_hits = SortedDict()

    # ---- Phase 1: place hits, best score first -------------------------------
    redo_flag = True
    while redo_flag:
        if verbose:
            print("Sorting through intervals", pfam_hit_dict)
        redo_flag = False

        # Highest score first. The list is a snapshot, so pfam_hit_dict can be
        # edited safely while we walk it.
        intervals_scores = sorted(
            ((key, value[1]) for key, value in pfam_hit_dict.items()),
            key=itemgetter(1), reverse=True)

        # Fresh tree on every pass: it only holds hits placed during this pass.
        intersect_tree = IntervalTree()

        for interval, _score in intervals_scores:
            interval_start = interval[0]
            interval_end = interval[1]
            original_length = interval_end - interval_start + 1
            interval_length = original_length

            # if the interval is not longer than the minimum domain size don't bother
            if interval_length <= minDomSize:
                continue

            overlaps = _overlapping(intersect_tree, interval_start, interval_end)
            contained = False

            # Shrink the candidate until nothing placed overlaps it any more.
            while overlaps and interval_length > 1:
                start, end = overlaps[0]

                if interval_start < start and interval_end > end:
                    # The candidate spans a placed hit: replace it by its two
                    # flanking pieces and restart the placement pass.
                    hit = pfam_hit_dict[interval]
                    if verbose:
                        print("Split Interval", interval, overlaps, hit)
                    # Scale against the ORIGINAL hit length. The candidate may
                    # already have been trimmed in this loop, and dividing by
                    # the trimmed length would overstate the pieces' coverage
                    # (the placement step below also scales against the
                    # original length).
                    left_scale = calculate_window((interval_start, start - 1)) / original_length
                    right_scale = calculate_window((end + 1, interval_end)) / original_length
                    pfam_hit_dict[(interval_start, start - 1)] = (hit[0], hit[1], hit[2] * left_scale)
                    pfam_hit_dict[(end + 1, interval_end)] = (hit[0], hit[1], hit[2] * right_scale)
                    # delete original hit and iterate
                    del pfam_hit_dict[interval]
                    redo_flag = True
                    break

                if interval_start >= start and interval_end <= end:
                    # Completely inside a placed hit: ignore it, the placed hit
                    # has the better (or equal) score because of the sort order.
                    contained = True
                    break
                elif interval_start >= start:
                    # the placed hit covers the left hand side of the candidate
                    interval_start = end + 1
                elif interval_end <= end:
                    # the placed hit covers the right hand side of the candidate
                    interval_end = start - 1

                # recalculate the length and see if there are still overlaps
                interval_length = interval_end - interval_start + 1
                overlaps = _overlapping(intersect_tree, interval_start, interval_end)

            if redo_flag:
                if verbose:
                    print("Exiting For Loop to Reinitialize", pfam_hit_dict)
                break

            # Keep what is left of the candidate if it is still long enough.
            if not contained and interval_length > minDomSize:
                hit = pfam_hit_dict[interval]
                if verbose:
                    print("Adding Hit", (interval_start, interval_end), hit[0])
                # scale the coverage by the fraction of the original hit that survived
                hit_coverage = hit[2] * (interval_length / float(original_length))
                gene_hits[(interval_start, interval_end)] = (hit[0], hit[1], hit_coverage)
                intersect_tree.add_interval(Interval(float(interval_start), interval_end))

    # ---- Phase 2: merge neighbouring hits with the same hit id ---------------
    if verbose:
        print("Merging Hits")

    # Restart the scan after every successful merge; stop once a full scan
    # finds nothing to merge (this also covers zero or one remaining hit).
    merged_any = True
    while merged_any:
        merged_any = False
        ordered_hits = list(gene_hits.keys())
        for left_hit, right_hit in zip(ordered_hits, ordered_hits[1:]):
            left_entry = gene_hits[left_hit]
            right_entry = gene_hits[right_hit]
            left_window_size = calculate_window(left_hit)
            right_window_size = calculate_window(right_hit)
            merged_window_size = calculate_window((left_hit[0], right_hit[1]))
            new_coverage = (left_entry[2] + right_entry[2]) * \
                           (left_window_size + right_window_size) / merged_window_size

            # Will merge a hit under the following conditions:
            # 1. Gap between the two hits is less than the minimum domain
            # 2. Both hits carry the same hit id
            # 3. Cumulative coverage of the two hits is less than 1
            #    (this avoids merging repeats together)
            if (right_hit[0] - left_hit[1] < minDomSize
                    and left_entry[0] == right_entry[0]
                    and new_coverage < 1):
                # The merged score is the window-size weighted mean of both scores.
                merged_score = (left_window_size / merged_window_size * left_entry[1]
                                + right_window_size / merged_window_size * right_entry[1])
                gene_hits[(left_hit[0], right_hit[1])] = (left_entry[0], merged_score, new_coverage)
                del gene_hits[left_hit]
                del gene_hits[right_hit]
                if verbose:
                    print("Merged", left_hit, right_hit)
                merged_any = True
                break

    # ---- Phase 3: drop everything below the minimum domain size --------------
    if verbose:
        print("Deleting Domains Under Minimum Domain Size")
    keys_to_delete = [coordinates for coordinates in gene_hits.keys()
                      if calculate_window(coordinates) < minDomSize]
    for key in keys_to_delete:
        del gene_hits[key]
        if verbose:
            print("Deleting", key)

    if verbose:
        print("Final Annotation", gene_hits)
    return gene_hits