def read_rep_region(self, regionfile):
    """Build an interval tree from the features of a GFF/GTF region file.

    Args:
        regionfile: Path handed to ``HTSeq.GFF_Reader`` (read with
            ``end_included=True``).

    Returns:
        An ``IntervalTree`` holding the interval of every feature in the
        file, each stored with the placeholder annotation ``'.'``.
    """
    region_reader = HTSeq.GFF_Reader(regionfile, end_included=True)
    rep_tree = IntervalTree()
    for region in region_reader:
        # Only the interval matters here; the annotation is a dummy value.
        rep_tree.insert(region.iv, annotation='.')
    return rep_tree
def selectGeneGtf(self, gtf_file):
    """Build an annotation tree from a GTF/GFF file and dump its exon lines.

    Every feature of ``gtf_file`` is inserted into an ``IntervalTree`` keyed
    by its interval. The annotation stored is the feature's attribute
    mapping with an extra ``'type'`` entry, or, if that cannot be built, the
    feature's raw GFF line.

    As a side effect, all features of type ``'exon'`` are written, sorted by
    (column 1, start, end), to
    ``self.tmp_dir + "tmp_" + basename(gtf_file) + ".exon.sorted"``.

    Args:
        gtf_file: Path of the annotation file to read.

    Returns:
        The populated ``IntervalTree``.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()
    exon_fields = []

    for feature in gtf:
        # Only exon lines go into the sorted side file; every feature,
        # whatever its type, goes into the annotation tree.
        if feature.type == 'exon':
            exon_fields.append(feature.get_gff_line().split('\t'))

        try:
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Best-effort fallback kept from the original code, but no longer
            # a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
            row = feature.get_gff_line()

        annotation_tree.insert(feature.iv, annotation=row)

    # Columns 4 and 5 (index 3 and 4) are start/end and must sort numerically.
    exon_fields.sort(key=lambda fields: (fields[0], int(fields[3]), int(fields[4])))

    # Path is built by plain concatenation on purpose: callers may rely on
    # ``self.tmp_dir`` already carrying its trailing separator.
    out_path = self.tmp_dir + "tmp_" + os.path.basename(gtf_file) + '.exon.sorted'

    # Lines are written exactly as returned by get_gff_line(); no line
    # terminator is added here.
    with open(out_path, 'w') as new_gtf:
        new_gtf.writelines('\t'.join(fields) for fields in exon_fields)

    return annotation_tree
def selectGeneGtf(self, gtf_file):
    """Build an annotation tree from a GTF/GFF file and dump its exon lines.

    Every feature of ``gtf_file`` is inserted into an ``IntervalTree`` keyed
    by its interval. The annotation stored is the feature's attribute
    mapping with an extra ``'type'`` entry, or, if that cannot be built, the
    feature's raw GFF line.

    As a side effect, all features of type ``'exon'`` are written, sorted by
    (column 1, start, end), to
    ``self.tmp_dir + "tmp_" + basename(gtf_file) + ".exon.sorted"``.

    Args:
        gtf_file: Path of the annotation file to read.

    Returns:
        The populated ``IntervalTree``.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()
    exon_fields = []

    for feature in gtf:
        # Only exon lines go into the sorted side file; every feature,
        # whatever its type, goes into the annotation tree.
        if feature.type == 'exon':
            exon_fields.append(feature.get_gff_line().split('\t'))

        try:
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Best-effort fallback kept from the original code, but no longer
            # a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
            row = feature.get_gff_line()

        annotation_tree.insert(feature.iv, annotation=row)

    # Columns 4 and 5 (index 3 and 4) are start/end and must sort numerically.
    exon_fields.sort(key=lambda fields: (fields[0], int(fields[3]), int(fields[4])))

    # Path is built by plain concatenation on purpose: callers may rely on
    # ``self.tmp_dir`` already carrying its trailing separator.
    out_path = self.tmp_dir + "tmp_" + os.path.basename(gtf_file) + '.exon.sorted'

    # Lines are written exactly as returned by get_gff_line(); no line
    # terminator is added here.
    with open(out_path, 'w') as new_gtf:
        new_gtf.writelines('\t'.join(fields) for fields in exon_fields)

    return annotation_tree
def compute_scores(self, men, women, dmz, drt):
    """Build the sparse score matrix between the rows of ``men`` and ``women``.

    For every row of ``men``, candidate rows of ``women`` are looked up in an
    interval tree using the man's mass range. Candidates that pass
    ``is_within_tolerance`` get a distance and a weight from
    ``self.compute_dist``. Distances are then turned into scores
    ``weight * (1 - dist / max_dist)`` and divided by the largest score.

    If a ``ZeroDivisionError`` occurs while doing so (for instance no score
    above zero was found, so the normalising maximum is still ``0``), the
    scores are recomputed as ``weight * (1 - dist)`` and returned without
    normalisation.

    Args:
        men: Object whose ``rows`` sequence provides ``get_mass_range`` and
            ``is_within_tolerance``.
        women: Object whose ``rows`` sequence is indexed by the interval
            tree; each row exposes ``row_id`` (used as column index).
        dmz: Mass tolerance forwarded to the row helpers.
        drt: Retention-time tolerance forwarded to the row helpers.

    Returns:
        A ``(score_arr, Q)`` tuple of sparse ``(len(men.rows), len(women.rows))``
        matrices; ``Q[i, j]`` is ``1`` wherever a score was written.
    """
    n_row = len(men.rows)
    n_col = len(women.rows)
    dist_arr = lil_matrix((n_row, n_col))
    weight_arr = lil_matrix((n_row, n_col))
    max_dist = 0
    T = IntervalTree(women.rows)

    for i, man in enumerate(men.rows):
        mass_lower, mass_upper = man.get_mass_range(dmz, absolute_mass_tolerance=False)
        candidate_women = T.search(int(mass_lower), int(mass_upper))
        for woman in candidate_women:
            if man.is_within_tolerance(woman, dmz, drt, absolute_mass_tolerance=False):
                dist, w = self.compute_dist(man, woman, dmz, drt)
                j = woman.row_id
                dist_arr[i, j] = dist
                weight_arr[i, j] = w
                if dist > max_dist:
                    max_dist = dist

    # COO format exposes parallel row/col/data arrays, which is the cheap way
    # to walk the stored entries; see
    # http://stackoverflow.com/questions/4319014/iterating-through-a-scipy-sparse-vector-or-matrix
    dist_arr = dist_arr.tocoo()

    try:
        score_arr = lil_matrix((n_row, n_col))
        Q = lil_matrix((n_row, n_col))
        max_score = 0
        # Builtin zip instead of itertools.izip: izip was removed in
        # Python 3, while zip works on both major versions.
        for i, j, v in zip(dist_arr.row, dist_arr.col, dist_arr.data):
            score = weight_arr[i, j] * (1 - (v / max_dist))
            score_arr[i, j] = score
            Q[i, j] = 1
            if score > max_score:
                max_score = score
        # normalise
        score_arr = score_arr * (1 / max_score)
        return score_arr, Q
    except ZeroDivisionError:
        # Fallback: start from clean matrices and use the raw distance
        # without any normalisation.
        score_arr = lil_matrix((n_row, n_col))
        Q = lil_matrix((n_row, n_col))
        for i, j, v in zip(dist_arr.row, dist_arr.col, dist_arr.data):
            score_arr[i, j] = weight_arr[i, j] * (1 - v)
            Q[i, j] = 1
        return score_arr, Q
def selectGeneGtf(self, gtf_file):
    """Build an annotation tree from a GTF/GFF annotation file.

    Every feature of the file (no filtering by feature type is done) is
    inserted into an ``IntervalTree`` keyed by its interval. The annotation
    stored is the feature's attribute mapping with an extra ``'type'`` entry,
    or, if that cannot be built, the feature's raw GFF line.

    Args:
        gtf_file: Path of the annotation file to read.

    Returns:
        The populated ``IntervalTree``.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()

    for feature in gtf:
        try:
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Best-effort fallback kept from the original code, but no longer
            # a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
            row = feature.get_gff_line()
        annotation_tree.insert(feature.iv, annotation=row)

    return annotation_tree
def selectGeneGtf(self, gtf_file):
    """Build an annotation tree from a GTF/GFF annotation file.

    Every feature of the file (no filtering by feature type is done) is
    inserted into an ``IntervalTree`` keyed by its interval. The annotation
    stored is the feature's attribute mapping with an extra ``'type'`` entry,
    or, if that cannot be built, the feature's raw GFF line.

    Args:
        gtf_file: Path of the annotation file to read.

    Returns:
        The populated ``IntervalTree``.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()

    for feature in gtf:
        try:
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Best-effort fallback kept from the original code, but no longer
            # a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
            row = feature.get_gff_line()
        annotation_tree.insert(feature.iv, annotation=row)

    return annotation_tree
def intersectcirc(self, circ_file, modified_gtf_file, strand=True, isStartBED=True):
    """Map BED intervals to the custom exon ids they touch.

    The exon features of ``modified_gtf_file`` are loaded into an interval
    tree. For each line of ``circ_file`` (the result file of
    print_start_end_file) a 1 bp probe interval is built from the second
    column and intersected with that tree; the ``custom_exon_id`` of every
    hit is collected.

    Args:
        circ_file: Tab-separated BED-like file; columns 1, 2, 3 and 6
            (chromosome, start, end, strand) are used.
        modified_gtf_file: GTF/GFF file whose features carry a
            ``custom_exon_id`` attribute.
        strand: Not used by this function; kept for interface compatibility.
        isStartBED: If true the probe is ``[pos, pos + 1)``, otherwise
            ``[pos - 1, pos)``, where ``pos`` is the second BED column.

    Returns:
        A dict mapping ``HTSeq.GenomicInterval`` (built from the unmodified
        BED start/end) to the set of custom exon ids found. Lines without
        any hit produce no entry.
    """
    # Read the BED lines up front; the context manager guarantees the handle
    # is released even if reading fails.
    with open(circ_file) as bed_handle:
        input_bed_file = bed_handle.readlines()

    exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
    gtf_exon_sorted = IntervalTree()
    for feature in exon_gtf_file:
        gtf_exon_sorted.insert(feature.iv, annotation=feature.attr)

    circ_exon_set = {}

    for bed_line in input_bed_file:
        # A blank line (e.g. a trailing newline at end of file) carries no
        # coordinates; skip it instead of failing on the column access below.
        if not bed_line.strip():
            continue

        bed_field = bed_line.split('\t')
        chrom = bed_field[0]
        position = int(bed_field[1])
        strand_symbol = bed_field[5].strip()
        custom_exon_list = []

        # we add 1bp in order for intersect to work correctly
        # different case for start or end bed file
        if isStartBED:
            start, end = position, position + 1
        else:
            start, end = position - 1, position

        # in order for the intersect to work, we need at least 1bp frame size
        current_bed_interval = HTSeq.GenomicInterval(chrom, start, end, strand_symbol)

        # for later processing however, we again need the "0" bp frame window
        insert_bed_interval = HTSeq.GenomicInterval(
            chrom, position, int(bed_field[2]), strand_symbol)

        # extract all custom exons
        gtf_exon_sorted.intersect(
            current_bed_interval,
            lambda hit: custom_exon_list.append(hit.annotation['custom_exon_id']))

        # only create an entry when at least one custom exon was found
        if custom_exon_list:
            circ_exon_set.setdefault(insert_bed_interval, set()).update(custom_exon_list)

    # return the filled set
    return circ_exon_set
def shiftReduceParse(linearTree, string):
    """
    Parse listed tree items from right to left (shift-reduce).

    Each item of ``linearTree`` is read as ``node[0]`` (node name),
    ``node[1]`` (number of children to pop from the buffer) and ``node[2]``
    (tuple of aligned word indices, ``(0,)`` meaning "unaligned").

    Before parsing, the first word (index 1) and the last word (index
    ``len(string)``) are force-aligned to the first and last item
    respectively if no item is aligned to them. This modifies ``linearTree``
    in place.

    Returns the root tree, or None if ``linearTree`` is empty or some node
    is not aligned.
    """
    # Nothing to parse: there is no tree to build.
    if not linearTree:
        return None

    def isAligned(idx):
        # True if any item lists idx among its aligned word indices.
        return any(idx in node[2] for node in linearTree)

    treeBuffer = []

    # check whether first and last word are aligned
    # if not, attach them to the first / last item
    lastIndex = len(string)
    if not isAligned(1):
        logging.info("Align first word to first semantic node")
        linearTree[0][2] += (1,)
    if not isAligned(lastIndex):
        logging.info("Align last word to last semantic node")
        linearTree[-1][2] += (lastIndex,)

    for node in reversed(linearTree):
        t = IntervalTree()
        t.name = node[0]

        # Add child nodes to the current node
        # by popping them from the buffer
        for _ in range(node[1]):
            t.childNodes.append(treeBuffer.pop())

        # unaligned words: give up on the whole tree
        # (the interval-guessing code that used to follow this return was
        # unreachable and has been removed)
        if node[2] == (0,):
            return None

        # The node spans its own aligned words plus everything its children
        # cover; word indices are converted to a start offset by the -1.
        minInterval, maxInterval = min(node[2]) - 1, max(node[2])
        for child in t.childNodes:
            childInterval = child.interval
            minInterval = min(minInterval, childInterval.start)
            maxInterval = max(maxInterval, childInterval.end)
        t.interval = Interval(minInterval, maxInterval)

        t.alignment = node[2]
        treeBuffer.append(t)

    return treeBuffer[0]
def generate_interval_tree_from_bed_file(regions_bed_path):
    """Create one interval tree per chromosome from a BED file.

    Args:
        regions_bed_path: Path of the BED file, passed to ``TsvHandler``.

    Returns:
        A dict mapping each chromosome key returned by the handler to an
        ``IntervalTree`` built from that chromosome's intervals.
    """
    # collect intervals from BED in illumina PG standards and convert to
    # intervals that make sense: 0-based, closed
    handler = TsvHandler(regions_bed_path)
    bed_intervals_by_chromosome = handler.get_bed_intervals_by_chromosome(
        universal_offset=-1, start_offset=1)

    interval_trees_by_chromosome = {
        chromosome: IntervalTree(bed_intervals_by_chromosome[chromosome])
        for chromosome in bed_intervals_by_chromosome
    }

    print("chromosomes: ", bed_intervals_by_chromosome.keys())

    return interval_trees_by_chromosome
def intersectcirc(self, circ_file, modified_gtf_file, strand=True, isStartBED=True):
    """Map BED intervals to the custom exon ids they touch.

    The exon features of ``modified_gtf_file`` are loaded into an interval
    tree. For each line of ``circ_file`` (the result file of
    print_start_end_file) a 1 bp probe interval is built from the second
    column and intersected with that tree; the ``custom_exon_id`` of every
    hit is collected.

    Args:
        circ_file: Tab-separated BED-like file; columns 1, 2, 3 and 6
            (chromosome, start, end, strand) are used.
        modified_gtf_file: GTF/GFF file whose features carry a
            ``custom_exon_id`` attribute.
        strand: Not used by this function; kept for interface compatibility.
        isStartBED: If true the probe is ``[pos, pos + 1)``, otherwise
            ``[pos - 1, pos)``, where ``pos`` is the second BED column.

    Returns:
        A dict mapping ``HTSeq.GenomicInterval`` (built from the unmodified
        BED start/end) to the set of custom exon ids found. Lines without
        any hit produce no entry.
    """
    # Read the BED lines up front; the context manager guarantees the handle
    # is released even if reading fails.
    with open(circ_file) as bed_handle:
        input_bed_file = bed_handle.readlines()

    exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
    gtf_exon_sorted = IntervalTree()
    for feature in exon_gtf_file:
        gtf_exon_sorted.insert(feature.iv, annotation=feature.attr)

    circ_exon_set = {}

    for bed_line in input_bed_file:
        # A blank line (e.g. a trailing newline at end of file) carries no
        # coordinates; skip it instead of failing on the column access below.
        if not bed_line.strip():
            continue

        bed_field = bed_line.split('\t')
        chrom = bed_field[0]
        position = int(bed_field[1])
        strand_symbol = bed_field[5].strip()
        custom_exon_list = []

        # we add 1bp in order for intersect to work correctly
        # different case for start or end bed file
        if isStartBED:
            start, end = position, position + 1
        else:
            start, end = position - 1, position

        # in order for the intersect to work, we need at least 1bp frame size
        current_bed_interval = HTSeq.GenomicInterval(chrom, start, end, strand_symbol)

        # for later processing however, we again need the "0" bp frame window
        insert_bed_interval = HTSeq.GenomicInterval(
            chrom, position, int(bed_field[2]), strand_symbol)

        # extract all custom exons
        gtf_exon_sorted.intersect(
            current_bed_interval,
            lambda hit: custom_exon_list.append(hit.annotation['custom_exon_id']))

        # only create an entry when at least one custom exon was found
        if custom_exon_list:
            circ_exon_set.setdefault(insert_bed_interval, set()).update(custom_exon_list)

    # return the filled set
    return circ_exon_set