def read_rep_region(self, regionfile):
    """Load the features of a GFF/GTF region file into an interval tree.

    Each feature yielded by ``HTSeq.GFF_Reader`` (opened with
    ``end_included=True``) is inserted into a fresh ``IntervalTree`` under
    its genomic interval, with the placeholder annotation ``'.'``.

    :param regionfile: file handed to ``HTSeq.GFF_Reader``.
    :return: the populated ``IntervalTree``.
    """
    tree = IntervalTree()
    reader = HTSeq.GFF_Reader(regionfile, end_included=True)
    for region in reader:
        # Only the interval matters here; the annotation is a fixed dummy.
        tree.insert(region.iv, annotation='.')
    return tree
def selectGeneGtf(self, gtf_file):
    """Build the annotation tree and write a sorted, exon-only copy of the GTF.

    Every feature of ``gtf_file`` is inserted into an ``IntervalTree`` under
    its interval. The annotation is the feature's attribute mapping with an
    extra ``'type'`` key, or the raw GFF line when that mapping cannot be
    used. Features of type ``'exon'`` are additionally collected, sorted by
    (column 1, column 4 as int, column 5 as int) and written to
    ``self.tmp_dir + "tmp_" + basename(gtf_file) + ".exon.sorted"``.

    NOTE(review): ``self.tmp_dir`` is concatenated as-is, so it presumably
    ends with a path separator -- confirm against the caller.

    :param gtf_file: path of the GTF/GFF annotation file.
    :return: ``IntervalTree`` holding all features of the file.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()
    gtf_exon = []
    for feature in gtf:
        # Only exon lines go into the exon-only output file ...
        if feature.type == 'exon':
            gtf_exon.append(feature.get_gff_line().split('\t'))

        # ... but every feature is added to the annotation tree.
        iv = feature.iv
        try:
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Best-effort fallback: keep the raw line as the annotation.
            # (Was a bare ``except:``, which also trapped KeyboardInterrupt
            # and SystemExit.)
            row = feature.get_gff_line()
        annotation_tree.insert(iv, annotation=row)

    gtf_exon.sort(key=lambda x: (x[0], int(x[3]), int(x[4])))

    out_path = (self.tmp_dir + "tmp_" + os.path.basename(gtf_file)
                + '.exon.sorted')
    # ``with`` guarantees the file is closed even if writing fails.
    with open(out_path, 'w') as new_gtf:
        # writelines() adds no separators; this relies on each GFF line
        # carrying its own terminator -- presumably get_gff_line() does.
        new_gtf.writelines('\t'.join(fields) for fields in gtf_exon)

    return annotation_tree
def selectGeneGtf(self, gtf_file):
    """Construct the annotation tree and emit a sorted exon-only GTF file.

    All features read from ``gtf_file`` are stored in an ``IntervalTree``
    keyed by their interval. The stored annotation is the feature's
    attribute mapping extended with a ``'type'`` entry; if that fails, the
    raw GFF line is stored instead. Lines whose feature type is ``'exon'``
    are also gathered, ordered by (column 1, int(column 4), int(column 5))
    and written to
    ``self.tmp_dir + "tmp_" + basename(gtf_file) + ".exon.sorted"``.

    NOTE(review): the output path is built by plain string concatenation,
    so ``self.tmp_dir`` is presumably expected to end with a path
    separator -- verify.

    :param gtf_file: path of the GTF/GFF annotation file.
    :return: ``IntervalTree`` containing every feature of the file.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()
    gtf_exon = []
    for feature in gtf:
        # The exon-only file receives exon lines exclusively.
        if feature.type == 'exon':
            gtf_exon.append(feature.get_gff_line().split('\t'))

        # The tree, in contrast, receives every feature.
        iv = feature.iv
        try:
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Fall back to the raw line. The previous bare ``except:``
            # would also have swallowed KeyboardInterrupt / SystemExit.
            row = feature.get_gff_line()
        annotation_tree.insert(iv, annotation=row)

    gtf_exon_sorted = sorted(
        gtf_exon, key=lambda x: (x[0], int(x[3]), int(x[4])))

    exon_path = (self.tmp_dir + "tmp_" + os.path.basename(gtf_file)
                 + '.exon.sorted')
    # Context manager closes the handle even when the write raises.
    with open(exon_path, 'w') as new_gtf:
        # No newline is appended here; each joined line is presumably
        # already terminated by get_gff_line() -- confirm.
        new_gtf.writelines('\t'.join(s) for s in gtf_exon_sorted)

    return annotation_tree
def selectGeneGtf(self, gtf_file):
    """Build an interval tree from all features of a GTF/GFF annotation file.

    No filtering by feature type takes place: every feature is inserted
    under its interval. The annotation is the feature's attribute mapping
    with an added ``'type'`` key, or the raw GFF line when the mapping
    cannot be used.

    :param gtf_file: path of the GTF/GFF annotation file.
    :return: the populated ``IntervalTree``.
    """
    gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
    annotation_tree = IntervalTree()
    for feature in gtf:
        iv = feature.iv
        try:
            # Note: this adds the key to the feature's own attr mapping.
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Best-effort fallback to the raw line. Narrowed from a bare
            # ``except:`` so KeyboardInterrupt / SystemExit propagate.
            row = feature.get_gff_line()
        annotation_tree.insert(iv, annotation=row)
    return annotation_tree
def selectGeneGtf(self, gtf_file):
    """Load every feature of a GTF/GFF annotation file into an interval tree.

    Features are not filtered by type. Each one is stored under its
    interval, annotated with its attribute mapping plus a ``'type'`` entry;
    when that is not possible the raw GFF line is stored instead.

    :param gtf_file: path of the GTF/GFF annotation file.
    :return: the populated ``IntervalTree``.
    """
    annotation_tree = IntervalTree()
    for feature in HTSeq.GFF_Reader(gtf_file, end_included=True):
        iv = feature.iv
        try:
            # The 'type' key is written into the feature's own attr mapping.
            row = feature.attr
            row['type'] = feature.type
        except Exception:
            # Keep the best-effort fallback, but no longer trap
            # KeyboardInterrupt / SystemExit as the bare ``except:`` did.
            row = feature.get_gff_line()
        annotation_tree.insert(iv, annotation=row)
    return annotation_tree
def intersectcirc(self, circ_file, modified_gtf_file, strand=True, isStartBED=True):
    """Map BED positions to the custom exon ids they overlap.

    The features of ``modified_gtf_file`` are loaded into an interval tree
    annotated with their attribute mappings. Each line of ``circ_file``
    (tab-separated; columns 1, 2, 3 and 6 are read) is then probed against
    that tree with a 1 bp window derived from column 2, and the
    ``'custom_exon_id'`` of every hit is collected.

    :param circ_file: BED-like file (result of print_start_end_file,
        according to the original comment).
    :param modified_gtf_file: GTF/GFF file whose features carry a
        ``custom_exon_id`` attribute.
    :param strand: accepted for interface compatibility; not used here.
    :param isStartBED: if true the probe window is ``[pos, pos + 1)``,
        otherwise ``[pos - 1, pos)``, with ``pos`` taken from column 2.
    :return: dict mapping ``HTSeq.GenomicInterval`` (chrom, column 2,
        column 3, strand) to the set of overlapping custom exon ids; lines
        without any hit are omitted.
    """
    # Read the BED file up front and make sure the handle gets closed.
    with open(circ_file) as bed_handle:
        input_bed_file = bed_handle.readlines()

    exon_tree = IntervalTree()
    for feature in HTSeq.GFF_Reader(modified_gtf_file, end_included=True):
        exon_tree.insert(feature.iv, annotation=feature.attr)

    circ_exon_set = {}
    for bed_line in input_bed_file:
        # Blank lines (e.g. a trailing newline) carry no record.
        if not bed_line.strip():
            continue

        bed_field = bed_line.split('\t')
        chrom = bed_field[0]
        position = int(bed_field[1])
        # Stripped because the column may carry the line terminator.
        bed_strand = bed_field[5].strip()

        # The intersect needs a window of at least 1 bp; which side it is
        # extended to depends on whether this is a start or an end BED file.
        if isStartBED:
            start, end = position, position + 1
        else:
            start, end = position - 1, position
        probe_interval = HTSeq.GenomicInterval(chrom, start, end, bed_strand)

        # Later processing needs the original ("0 bp") coordinates as key.
        insert_bed_interval = HTSeq.GenomicInterval(
            chrom, position, int(bed_field[2]), bed_strand)

        # Collect the custom exon id of every overlapping exon.
        custom_exon_list = []
        exon_tree.intersect(
            probe_interval,
            lambda hit: custom_exon_list.append(
                hit.annotation['custom_exon_id']))

        # Only create an entry when at least one exon was found.
        if custom_exon_list:
            circ_exon_set.setdefault(insert_bed_interval, set()).update(
                custom_exon_list)

    return circ_exon_set
def intersectcirc(self, circ_file, modified_gtf_file, strand=True, isStartBED=True):
    """Collect, per BED record, the custom exon ids overlapping its position.

    An interval tree is built from the features of ``modified_gtf_file``
    (annotation: the feature's attribute mapping). Every line of
    ``circ_file`` is split on tabs; columns 1, 2, 3 and 6 are read. A 1 bp
    window around the column-2 position is intersected with the tree and
    the ``'custom_exon_id'`` of each overlapping feature is recorded.

    :param circ_file: BED-like input (the result file of
        print_start_end_file, per the original comment).
    :param modified_gtf_file: GTF/GFF file whose features provide a
        ``custom_exon_id`` attribute.
    :param strand: unused; kept so existing callers keep working.
    :param isStartBED: selects the probe window: ``[pos, pos + 1)`` when
        true, ``[pos - 1, pos)`` otherwise (``pos`` = column 2).
    :return: dict from ``HTSeq.GenomicInterval`` (chrom, column 2,
        column 3, strand) to a set of custom exon ids. Records without
        overlapping exons do not appear.
    """
    # ``with`` closes the BED file; the bare open().readlines() leaked it.
    with open(circ_file) as bed_handle:
        input_bed_file = bed_handle.readlines()

    exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
    gtf_exon_sorted = IntervalTree()
    for feature in exon_gtf_file:
        gtf_exon_sorted.insert(feature.iv, annotation=feature.attr)

    circ_exon_set = {}
    for bed_line in input_bed_file:
        # Skip empty lines instead of failing on the missing columns.
        if not bed_line.strip():
            continue

        bed_field = bed_line.split('\t')
        custom_exon_list = []

        # Add 1 bp so the intersect has a non-empty window; the direction
        # differs between start and end BED files.
        if isStartBED:
            start = int(bed_field[1])
            end = int(bed_field[1]) + 1
        else:
            start = int(bed_field[1]) - 1
            end = int(bed_field[1])

        # Window used only for the tree lookup.
        current_bed_interval = HTSeq.GenomicInterval(
            bed_field[0], start, end, bed_field[5].strip())

        # Original "0 bp" coordinates, used as key in the result.
        insert_bed_interval = HTSeq.GenomicInterval(
            bed_field[0], int(bed_field[1]), int(bed_field[2]),
            bed_field[5].strip())

        # Gather the ids of all custom exons hit by the window.
        gtf_exon_sorted.intersect(
            current_bed_interval,
            lambda x: custom_exon_list.append(
                x.annotation['custom_exon_id']))

        # No hits -> no entry; otherwise merge all ids into the set.
        if custom_exon_list:
            circ_exon_set.setdefault(insert_bed_interval, set()).update(
                custom_exon_list)

    return circ_exon_set