Example #1
0
 def read_rep_region(self, regionfile):
     """Load the features of a GFF region file into an interval tree.

     Each feature yielded by ``HTSeq.GFF_Reader`` is inserted under its
     genomic interval with the placeholder annotation ``'.'``.

     :param regionfile: input passed to ``HTSeq.GFF_Reader``
     :return: the populated ``IntervalTree``
     """
     reader = HTSeq.GFF_Reader(regionfile, end_included=True)
     tree = IntervalTree()
     for record in reader:
         # Only the interval matters here; the annotation is a dummy value.
         tree.insert(record.iv, annotation='.')
     return tree
Example #2
0
 def read_rep_region(self, regionfile):
     """Build an ``IntervalTree`` holding every interval of a GFF file.

     The file is read through ``HTSeq.GFF_Reader`` (``end_included=True``)
     and each feature's interval is stored with the annotation ``'.'``.

     :param regionfile: input passed to ``HTSeq.GFF_Reader``
     :return: ``IntervalTree`` with one entry per feature read
     """
     gff_features = HTSeq.GFF_Reader(regionfile, end_included=True)
     region_tree = IntervalTree()
     for gff_feature in gff_features:
         interval = gff_feature.iv
         # The annotation value is a fixed placeholder.
         region_tree.insert(interval, annotation='.')
     return region_tree
Example #3
0
    def selectGeneGtf(self, gtf_file):
        """Build an annotation tree from a GTF/GFF file and dump its exons.

        Every feature read from ``gtf_file`` is inserted into an
        ``IntervalTree`` under its genomic interval. The stored annotation
        is the feature's attribute mapping extended with a ``'type'`` entry;
        if that mapping cannot be built, the feature's GFF line is stored
        instead.

        Side effect: the lines of all ``exon`` features are sorted by
        (first column, start, end) and written to
        ``self.tmp_dir + "tmp_" + <basename of gtf_file> + ".exon.sorted"``.

        :param gtf_file: path of the annotation file; its basename is used
            to name the exon output file
        :return: the populated ``IntervalTree``
        """
        gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
        annotation_tree = IntervalTree()
        gtf_exon = []
        for feature in gtf:
            if feature.type == 'exon':
                # Captured before feature.attr is modified further down.
                gtf_exon.append(feature.get_gff_line().split('\t'))

            try:
                row = feature.attr
                row['type'] = feature.type
            except Exception:
                # Best-effort fallback: keep the raw line as annotation.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit propagate.
                row = feature.get_gff_line()

            annotation_tree.insert(feature.iv, annotation=row)

        # Columns 4 and 5 of a GFF line are start and end; compare them
        # numerically rather than as strings.
        gtf_exon.sort(key=lambda x: (x[0], int(x[3]), int(x[4])))

        # Plain concatenation is kept on purpose so the output location is
        # exactly what callers of the original code expect.
        exon_path = (self.tmp_dir + "tmp_" + os.path.basename(gtf_file) +
                     '.exon.sorted')
        # The context manager closes the file even if writing fails.
        # NOTE(review): writelines adds no separators; this relies on
        # get_gff_line() output already ending in a newline - confirm.
        with open(exon_path, 'w') as new_gtf:
            new_gtf.writelines('\t'.join(fields) for fields in gtf_exon)
        return annotation_tree
Example #4
0
    def selectGeneGtf(self, gtf_file):
        """Index all features of a GTF/GFF file and write out its exon lines.

        Each feature is stored in an ``IntervalTree`` under its genomic
        interval. The annotation is the feature's attribute mapping with an
        added ``'type'`` key, or the feature's GFF line when that mapping
        cannot be built.

        Side effect: the lines of ``exon`` features, sorted by
        (first column, start, end), are written to
        ``self.tmp_dir + "tmp_" + <basename of gtf_file> + ".exon.sorted"``.

        :param gtf_file: path of the annotation file; its basename is used
            to name the exon output file
        :return: the populated ``IntervalTree``
        """
        gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
        annotation_tree = IntervalTree()
        gtf_exon = []
        for feature in gtf:
            if feature.type == 'exon':
                # Taken before feature.attr is modified below.
                gtf_exon.append(feature.get_gff_line().split('\t'))

            try:
                row = feature.attr
                row['type'] = feature.type
            except Exception:
                # Fall back to the raw line, but do not swallow
                # KeyboardInterrupt / SystemExit as a bare ``except`` would.
                row = feature.get_gff_line()

            annotation_tree.insert(feature.iv, annotation=row)

        # Start and end (columns 4 and 5) are compared as integers.
        gtf_exon_sorted = sorted(gtf_exon,
                                 key=lambda x: (x[0], int(x[3]), int(x[4])))

        # The path is built by plain concatenation, exactly as before.
        out_path = (
            self.tmp_dir + "tmp_" + os.path.basename(gtf_file) +
            '.exon.sorted')
        # ``with`` guarantees the handle is closed if the write raises.
        # NOTE(review): no newline is appended here; presumably
        # get_gff_line() already terminates each line - confirm.
        with open(out_path, 'w') as new_gtf:
            new_gtf.writelines('\t'.join(s) for s in gtf_exon_sorted)
        return annotation_tree
Example #5
0
 def selectGeneGtf(self, gtf_file):
     """Build an annotation tree from a GTF/GFF annotation file.

     Every feature (no filtering by type) is inserted into an
     ``IntervalTree`` under its genomic interval. The stored annotation is
     the feature's attribute mapping extended with a ``'type'`` entry, or
     the feature's GFF line if that mapping cannot be built.

     :param gtf_file: input passed to ``HTSeq.GFF_Reader``
     :return: the populated ``IntervalTree``
     """
     gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
     annotation_tree = IntervalTree()
     for feature in gtf:
         try:
             row = feature.attr
             row['type'] = feature.type
         except Exception:
             # Best-effort fallback to the raw line. ``Exception`` rather
             # than a bare ``except`` so that KeyboardInterrupt and
             # SystemExit still propagate.
             row = feature.get_gff_line()
         annotation_tree.insert(feature.iv, annotation=row)
     return annotation_tree
Example #6
0
 def selectGeneGtf(self, gtf_file):
     """Index every feature of a GTF/GFF file in an interval tree.

     All features are kept (nothing is filtered by type). Each one is
     inserted under its genomic interval, annotated with its attribute
     mapping plus a ``'type'`` key; when that mapping cannot be built the
     feature's GFF line is used as the annotation instead.

     :param gtf_file: input passed to ``HTSeq.GFF_Reader``
     :return: the populated ``IntervalTree``
     """
     gtf = HTSeq.GFF_Reader(gtf_file, end_included=True)
     annotation_tree = IntervalTree()
     for feature in gtf:
         try:
             row = feature.attr
             row['type'] = feature.type
         except Exception:
             # Keep the fallback, but let KeyboardInterrupt / SystemExit
             # through (a bare ``except`` would swallow them).
             row = feature.get_gff_line()
         annotation_tree.insert(feature.iv, annotation=row)
     return annotation_tree
    def intersectcirc(self,
                      circ_file,
                      modified_gtf_file,
                      strand=True,
                      isStartBED=True):
        """Collect the custom exon ids overlapping each BED position.

        The features of ``modified_gtf_file`` are loaded into an
        ``IntervalTree``. For every line of ``circ_file`` a 1 bp query
        window is built from the second BED column and intersected with the
        tree; the ``custom_exon_id`` attribute of every hit is recorded.

        :param circ_file: path of a tab-separated BED file with at least six
            columns (columns 1, 2, 3 and 6 are read)
        :param modified_gtf_file: annotation file read via
            ``HTSeq.GFF_Reader``; hit features must carry a
            ``custom_exon_id`` attribute
        :param strand: currently unused; kept for interface compatibility
        :param isStartBED: if true the query window is ``[pos, pos + 1)``,
            otherwise ``[pos - 1, pos)``, where ``pos`` is BED column 2
        :return: dict mapping ``HTSeq.GenomicInterval`` (built from BED
            columns 1, 2, 3 and 6) to the set of intersecting custom exon
            ids; lines without hits produce no entry
        """
        # Read all lines up front and close the handle straight away.
        with open(circ_file) as bed_handle:
            input_bed_file = bed_handle.readlines()

        exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
        gtf_exon_sorted = IntervalTree()
        for feature in exon_gtf_file:
            gtf_exon_sorted.insert(feature.iv, annotation=feature.attr)

        circ_exon_set = {}
        for bed_line in input_bed_file:
            if not bed_line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            bed_field = bed_line.split('\t')
            custom_exon_list = []

            # The intersect needs a window of at least 1 bp; which side it
            # is opened on depends on whether this is a start or end file.
            position = int(bed_field[1])
            if isStartBED:
                start, end = position, position + 1
            else:
                start, end = position - 1, position

            bed_strand = bed_field[5].strip()
            current_bed_interval = HTSeq.GenomicInterval(
                bed_field[0], start, end, bed_strand)

            # The returned key uses the coordinates as given in the file,
            # not the widened query window.
            insert_bed_interval = HTSeq.GenomicInterval(
                bed_field[0], position, int(bed_field[2]), bed_strand)

            # Gather the custom exon id of every feature hit by the window.
            gtf_exon_sorted.intersect(
                current_bed_interval, lambda x: custom_exon_list.append(
                    x.annotation['custom_exon_id']))

            if custom_exon_list:
                circ_exon_set.setdefault(insert_bed_interval,
                                         set()).update(custom_exon_list)

        return circ_exon_set
    def intersectcirc(self, circ_file, modified_gtf_file, strand=True, isStartBED=True):
        """Map each BED position to the custom exons it intersects.

        All features of ``modified_gtf_file`` are inserted into an
        ``IntervalTree``. Each line of ``circ_file`` is turned into a 1 bp
        query window based on BED column 2, and the ``custom_exon_id`` of
        every intersecting feature is collected.

        :param circ_file: path of a tab-separated BED file with at least six
            columns (columns 1, 2, 3 and 6 are read)
        :param modified_gtf_file: annotation file read via
            ``HTSeq.GFF_Reader``; hit features must carry a
            ``custom_exon_id`` attribute
        :param strand: currently unused; kept for interface compatibility
        :param isStartBED: if true the query window is ``[pos, pos + 1)``,
            otherwise ``[pos - 1, pos)``, where ``pos`` is BED column 2
        :return: dict mapping ``HTSeq.GenomicInterval`` (built from BED
            columns 1, 2, 3 and 6) to the set of intersecting custom exon
            ids; lines without hits produce no entry
        """
        # The context manager closes the BED file once its lines are read.
        with open(circ_file) as bed_handle:
            input_bed_file = bed_handle.readlines()

        exon_gtf_file = HTSeq.GFF_Reader(modified_gtf_file, end_included=True)
        gtf_exon_sorted = IntervalTree()
        for feature in exon_gtf_file:
            gtf_exon_sorted.insert(feature.iv, annotation=feature.attr)

        circ_exon_set = {}
        for bed_line in input_bed_file:
            # Skip blank lines instead of failing on missing columns.
            if not bed_line.strip():
                continue

            bed_field = bed_line.split('\t')
            custom_exon_list = []

            # A 1 bp window is required for the intersect to work; start
            # and end BED files open it on opposite sides of the position.
            position = int(bed_field[1])
            if isStartBED:
                start = position
                end = position + 1
            else:
                start = position - 1
                end = position

            bed_strand = bed_field[5].strip()
            current_bed_interval = HTSeq.GenomicInterval(bed_field[0],
                                                         start,
                                                         end,
                                                         bed_strand
                                                         )

            # Later processing needs the coordinates exactly as in the file.
            insert_bed_interval = HTSeq.GenomicInterval(bed_field[0],
                                                        position,
                                                        int(bed_field[2]),
                                                        bed_strand
                                                        )

            # Record the custom exon id of every feature hit by the window.
            gtf_exon_sorted.intersect(current_bed_interval,
                                      lambda x: custom_exon_list.append(x.annotation['custom_exon_id'])
                                      )

            if custom_exon_list:
                circ_exon_set.setdefault(insert_bed_interval, set()).update(custom_exon_list)

        return circ_exon_set