Example #1
0
    def next(self):
        """ Returns next GFFFeature.

        The code below makes sure a seed interval is available: while
        ``self.seed_interval`` is unset, intervals are read with
        ``GenomicIntervalReader.next``. Lines that raise ``ParseError`` are
        recorded and skipped. Any other exception raised by the underlying
        reader (e.g. ``StopIteration`` when nothing is left to read) is not
        caught here and propagates to the caller.
        """

        #
        # Helper function.
        #

        def handle_parse_error(parse_error):
            """ Actions to take when ParseError found.

            Reports ``parse_error`` through ``self.print_delegate`` when an
            output stream and a callable delegate are set, counts the skipped
            line, and stores (line number, line, message) while fewer than 10
            lines have been skipped.
            """
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate,
                                                   "__call__"):
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(parse_error)))

            # For debugging, uncomment this to propagate parsing exceptions up.
            # I.e. the underlying reason for an unexpected StopIteration exception
            # can be found by uncommenting this.
            # raise parse_error

        #
        # Get next GFFFeature
        #

        # Start from the length of the line that produced the current seed.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where iterator dies.
        while not self.seed_interval:
            try:
                self.seed_interval = GenomicIntervalReader.next(self)
            except ParseError as e:
                handle_parse_error(e)
            # Count the current line whether it was parsed or skipped.
            # TODO: When no longer supporting python 2.4 use finally:
            # finally:
            raw_size += len(self.current_line)
Example #2
0
    def next( self ):
        """ Returns next GFFFeature.

        The code below makes sure a seed interval is available: while
        ``self.seed_interval`` is unset, intervals are read with
        ``GenomicIntervalReader.next``. Lines that raise ``ParseError`` are
        recorded and skipped. Any other exception raised by the underlying
        reader (e.g. ``StopIteration`` when nothing is left to read) is not
        caught here and propagates to the caller.
        """

        #
        # Helper function.
        #

        def handle_parse_error( parse_error ):
            """ Actions to take when ParseError found.

            Reports ``parse_error`` through ``self.print_delegate`` when an
            output stream and a callable delegate are set, counts the skipped
            line, and stores (line number, line, message) while fewer than 10
            lines have been skipped.
            """
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
                    self.print_delegate( self.outstream, parse_error, self )
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append( ( self.linenum, self.current_line, str( parse_error ) ) )

            # For debugging, uncomment this to propagate parsing exceptions up.
            # I.e. the underlying reason for an unexpected StopIteration exception
            # can be found by uncommenting this.
            # raise parse_error

        #
        # Get next GFFFeature
        #

        # Start from the length of the line that produced the current seed.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where iterator dies.
        while not self.seed_interval:
            try:
                self.seed_interval = GenomicIntervalReader.next( self )
            except ParseError as e:
                handle_parse_error( e )
            # Count the current line whether it was parsed or skipped.
            # TODO: When no longer supporting python 2.4 use finally:
            # finally:
            raw_size += len( self.current_line )
Example #3
0
    def next( self ):
        """ Returns next GFFFeature.

        A feature is built from the seed interval plus every following
        interval that shares its GFF ``group``, matches its GFF3 ``ID``
        (through the interval's own ``ID`` or its ``Parent``), or shares its
        GTF ``transcript_id``. The first interval that does not belong to the
        feature is kept in ``self.seed_interval`` for the following call.

        When the seed is a Header or Comment it is returned as is, with its
        ``raw_size`` set to the length of the current line.

        Lines that raise ParseError are recorded and skipped. StopIteration
        raised while looking for a seed interval is not caught, which is how
        iteration ends; StopIteration raised while collecting the intervals of
        a feature only closes that feature.
        """

        #
        # Helper function.
        #

        def handle_parse_error( parse_error ):
            """ Actions to take when ParseError found.

            Reports ``parse_error`` through ``self.print_delegate`` when an
            output stream and a callable delegate are set, counts the skipped
            line, and stores (line number, line, message) while fewer than 10
            lines have been skipped.
            """
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
                    self.print_delegate( self.outstream, parse_error, self )
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append( ( self.linenum, self.current_line, str( parse_error ) ) )

            # For debugging, uncomment this to propagate parsing exceptions up.
            # I.e. the underlying reason for an unexpected StopIteration exception
            # can be found by uncommenting this.
            # raise parse_error

        #
        # Get next GFFFeature
        #

        # Start from the length of the line that produced the current seed.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where iterator dies.
        while not self.seed_interval:
            try:
                self.seed_interval = GenomicIntervalReader.next( self )
            except ParseError as e:
                handle_parse_error( e )
            # Count the current line whether it was parsed or skipped.
            # TODO: When no longer supporting python 2.4 use finally:
            # finally:
            raw_size += len( self.current_line )

        # If header or comment, clear seed interval and return it with its size.
        if isinstance( self.seed_interval, ( Header, Comment ) ):
            return_val = self.seed_interval
            return_val.raw_size = len( self.current_line )
            self.seed_interval = None
            self.seed_interval_line_len = 0
            return return_val

        # Initialize feature identifiers from seed.
        seed_attributes = self.seed_interval.attributes
        feature_group = seed_attributes.get( 'group', None )  # For GFF
        feature_id = seed_attributes.get( 'ID', None )  # For GFF3
        feature_transcript_id = seed_attributes.get( 'transcript_id', None )  # For GTF

        # Read all intervals associated with seed.
        feature_intervals = [ self.seed_interval ]
        while True:
            try:
                interval = GenomicIntervalReader.next( self )
                raw_size += len( self.current_line )
            except StopIteration:
                # No more intervals to read, but last feature needs to be
                # returned.
                interval = None
                raw_size += len( self.current_line )
                break
            except ParseError as e:
                handle_parse_error( e )
                raw_size += len( self.current_line )
                continue
            # TODO: When no longer supporting python 2.4 use finally:
            # finally:
            # raw_size += len( self.current_line )

            # Ignore comments.
            if isinstance( interval, Comment ):
                continue

            # Determine if interval is part of feature.
            attributes = interval.attributes
            group = attributes.get( 'group', None )
            parent_id = attributes.get( 'Parent', None )
            cur_id = attributes.get( 'ID', None )
            transcript_id = attributes.get( 'transcript_id', None )
            part_of = (
                # GFF test:
                ( group and feature_group == group ) or
                # GFF3 test:
                ( cur_id and cur_id == feature_id ) or
                ( parent_id and parent_id == feature_id ) or
                # GTF test:
                ( transcript_id and transcript_id == feature_transcript_id )
            )

            # If interval is not part of feature, clean up and break.
            if not part_of:
                # Adjust raw size because current line is not part of feature.
                raw_size -= len( self.current_line )
                break

            # Interval associated with feature.
            feature_intervals.append( interval )

        # Last interval read is the seed for the next interval.
        self.seed_interval = interval
        self.seed_interval_line_len = len( self.current_line )

        # Return feature.
        feature = GFFFeature( self, self.chrom_col, self.feature_col, self.start_col,
                              self.end_col, self.strand_col, self.score_col,
                              self.default_strand, fix_strand=self.fix_strand,
                              intervals=feature_intervals, raw_size=raw_size )

        # Convert to BED coords?
        if self.convert_to_bed_coord:
            convert_gff_coords_to_bed( feature )

        return feature
Example #4
0
class GFFReaderWrapper(NiceReaderWrapper):
    """
    Reader wrapper for GFF files.

    Wrapper has two major functions:

    1. group entries for GFF file (via group column), GFF3 (via id attribute),
       or GTF (via gene_id/transcript id);
    2. convert coordinates from GFF format--starting and ending coordinates
       are 1-based, closed--to the 'traditional'/BED interval format--0 based,
       half-open. This is useful when using GFF files as inputs to tools that
       expect traditional interval format.
    """
    def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3,
                 end_col=4, strand_col=6, score_col=5, fix_strand=False,
                 convert_to_bed_coord=False, **kwargs):
        """
        Set up the wrapper around ``reader``.

        The chrom/start/end/strand column indexes, ``fix_strand`` and any
        extra keyword arguments are handed to ``NiceReaderWrapper.__init__``;
        the feature and score column indexes and the BED conversion flag are
        stored on the instance, together with the initial (empty) reading
        state.
        """
        NiceReaderWrapper.__init__(
            self, reader,
            chrom_col=chrom_col, start_col=start_col, end_col=end_col,
            strand_col=strand_col, fix_strand=fix_strand,
            **kwargs)

        # Settings that are kept on this wrapper itself.
        self.feature_col = feature_col
        self.score_col = score_col
        self.convert_to_bed_coord = convert_to_bed_coord

        # Reading state: nothing has been read yet, so no seed interval.
        self.last_line = None
        self.cur_offset = 0
        self.seed_interval = None
        self.seed_interval_line_len = 0

    def parse_row(self, line):
        """ Build a GFFInterval from ``line`` split on tab characters. """
        fields = line.split("\t")
        return GFFInterval(
            self, fields,
            self.chrom_col, self.feature_col,
            self.start_col, self.end_col,
            self.strand_col, self.score_col,
            self.default_strand,
            fix_strand=self.fix_strand)

    def next(self):
        """ Returns next GFFFeature. """

        #
        # Helper function.
        #

        def handle_parse_error(parse_error):
            """ Actions to take when ParseError found. """
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate,
                                                   "__call__"):
                    self.print_delegate(self.outstream, e, self)
            self.skipped += 1
            # no reason to stuff an entire bad file into memmory
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(e)))

            # For debugging, uncomment this to propogate parsing exceptions up.
            # I.e. the underlying reason for an unexpected StopIteration exception
            # can be found by uncommenting this.
            # raise e

        #
        # Get next GFFFeature
        #
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError, e:
                    handle_parse_error(e)
                # TODO: When no longer supporting python 2.4 use finally:
                #finally:
                raw_size += len(self.current_line)

        # If header or comment, clear seed interval and return it with its size.
        if isinstance(self.seed_interval, (Header, Comment)):
            return_val = self.seed_interval
            return_val.raw_size = len(self.current_line)
            self.seed_interval = None
            self.seed_interval_line_len = 0
            return return_val

        # Initialize feature identifier from seed.
        feature_group = self.seed_interval.attributes.get('group',
                                                          None)  # For GFF
        # For GFF3
        feature_id = self.seed_interval.attributes.get('ID', None)
        # For GTF.
        feature_transcript_id = self.seed_interval.attributes.get(
            'transcript_id', None)

        # Read all intervals associated with seed.
        feature_intervals = []
        feature_intervals.append(self.seed_interval)
        while True:
            try:
                interval = GenomicIntervalReader.next(self)
                raw_size += len(self.current_line)
            except StopIteration, e:
                # No more intervals to read, but last feature needs to be
                # returned.
                interval = None
                raw_size += len(self.current_line)
                break
            except ParseError, e:
                handle_parse_error(e)
                raw_size += len(self.current_line)
                continue