def next(self): """ Returns next GFFFeature. """ # # Helper function. # def handle_parse_error(parse_error): """ Actions to take when ParseError found. """ if self.outstream: if self.print_delegate and hasattr(self.print_delegate, "__call__"): self.print_delegate(self.outstream, e, self) self.skipped += 1 # no reason to stuff an entire bad file into memmory if self.skipped < 10: self.skipped_lines.append( (self.linenum, self.current_line, str(e))) # For debugging, uncomment this to propogate parsing exceptions up. # I.e. the underlying reason for an unexpected StopIteration exception # can be found by uncommenting this. # raise e # # Get next GFFFeature # raw_size = self.seed_interval_line_len # If there is no seed interval, set one. Also, if there are no more # intervals to read, this is where iterator dies. if not self.seed_interval: while not self.seed_interval: try: self.seed_interval = GenomicIntervalReader.next(self) except ParseError, e: handle_parse_error(e) # TODO: When no longer supporting python 2.4 use finally: #finally: raw_size += len(self.current_line)
def next( self ): """ Returns next GFFFeature. """ # # Helper function. # def handle_parse_error( parse_error ): """ Actions to take when ParseError found. """ if self.outstream: if self.print_delegate and hasattr(self.print_delegate, "__call__"): self.print_delegate( self.outstream, e, self ) self.skipped += 1 # no reason to stuff an entire bad file into memmory if self.skipped < 10: self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) # For debugging, uncomment this to propogate parsing exceptions up. # I.e. the underlying reason for an unexpected StopIteration exception # can be found by uncommenting this. # raise e # # Get next GFFFeature # raw_size = self.seed_interval_line_len # If there is no seed interval, set one. Also, if there are no more # intervals to read, this is where iterator dies. if not self.seed_interval: while not self.seed_interval: try: self.seed_interval = GenomicIntervalReader.next( self ) except ParseError, e: handle_parse_error( e ) # TODO: When no longer supporting python 2.4 use finally: #finally: raw_size += len( self.current_line )
def next(self):
    """ Return the next GFFFeature (or a Header/Comment line) from the reader.

    Successive intervals are grouped into one feature by matching the seed
    interval's GFF 'group' attribute, GFF3 'ID'/'Parent' attributes, or GTF
    'transcript_id' attribute.  The accumulated raw byte size of the lines
    that make up the feature is attached as ``raw_size``.
    """

    def handle_parse_error(parse_error):
        """ Report a ParseError via the print delegate and remember up to
        the first 10 offending lines.

        Fixed: previously this read the caller's ``except`` variable ``e``
        instead of its own parameter, which breaks when invoked after the
        except block has exited.
        """
        if self.outstream:
            if self.print_delegate and hasattr(self.print_delegate, "__call__"):
                self.print_delegate(self.outstream, parse_error, self)
        self.skipped += 1
        # No reason to stuff an entire bad file into memory.
        if self.skipped < 10:
            self.skipped_lines.append(
                (self.linenum, self.current_line, str(parse_error)))
        # For debugging, uncomment this to propagate parsing exceptions up;
        # i.e. the underlying reason for an unexpected StopIteration can be
        # found by uncommenting this.
        # raise parse_error

    #
    # Get next GFFFeature.
    #
    # Running byte count of the raw lines that make up the feature.
    raw_size = self.seed_interval_line_len

    # If there is no seed interval, set one. Also, if there are no more
    # intervals to read, this is where the iterator dies.
    if not self.seed_interval:
        while not self.seed_interval:
            try:
                self.seed_interval = GenomicIntervalReader.next(self)
            except ParseError as e:
                handle_parse_error(e)
            # TODO: When no longer supporting python 2.4, use finally:
            raw_size += len(self.current_line)

    # If header or comment, clear seed interval and return it with its size.
    if isinstance(self.seed_interval, (Header, Comment)):
        return_val = self.seed_interval
        return_val.raw_size = len(self.current_line)
        self.seed_interval = None
        self.seed_interval_line_len = 0
        return return_val

    # Initialize feature identifiers from the seed.
    feature_group = self.seed_interval.attributes.get('group', None)  # For GFF.
    feature_id = self.seed_interval.attributes.get('ID', None)  # For GFF3.
    feature_transcript_id = self.seed_interval.attributes.get(
        'transcript_id', None)  # For GTF.

    # Read all intervals associated with the seed.
    feature_intervals = [self.seed_interval]
    while True:
        try:
            interval = GenomicIntervalReader.next(self)
            raw_size += len(self.current_line)
        except StopIteration:
            # No more intervals to read, but the last feature still needs
            # to be returned.
            interval = None
            raw_size += len(self.current_line)
            break
        except ParseError as e:
            handle_parse_error(e)
            raw_size += len(self.current_line)
            continue

        # Ignore comments.
        if isinstance(interval, Comment):
            continue

        # Determine if interval is part of the feature.
        part_of = False
        group = interval.attributes.get('group', None)
        # GFF test:
        if group and feature_group == group:
            part_of = True
        # GFF3 test:
        parent_id = interval.attributes.get('Parent', None)
        cur_id = interval.attributes.get('ID', None)
        if (cur_id and cur_id == feature_id) or \
                (parent_id and parent_id == feature_id):
            part_of = True
        # GTF test:
        transcript_id = interval.attributes.get('transcript_id', None)
        if transcript_id and transcript_id == feature_transcript_id:
            part_of = True

        # If interval is not part of the feature, clean up and break.
        if not part_of:
            # Adjust raw size because the current line is not part of the feature.
            raw_size -= len(self.current_line)
            break

        # Interval is associated with the feature.
        feature_intervals.append(interval)

    # The last interval read is the seed for the next feature (None at EOF).
    self.seed_interval = interval
    self.seed_interval_line_len = len(self.current_line)

    # Return feature.
    feature = GFFFeature(self, self.chrom_col, self.feature_col,
                         self.start_col, self.end_col, self.strand_col,
                         self.score_col, self.default_strand,
                         fix_strand=self.fix_strand,
                         intervals=feature_intervals, raw_size=raw_size)

    # Convert to BED coords?
    if self.convert_to_bed_coord:
        convert_gff_coords_to_bed(feature)

    return feature
def getRegionsAndGroups(regionsFileName, onlyMultiplesOf=1):
    """ Read a BED file of genomic intervals into an OrderedDict of groups.

    A line starting with a hash sign '#' is treated as a delimiter that
    splits the heatmap into groups: the text after '#' becomes the label of
    the regions collected so far (made unique with a "_r<i>" suffix when
    repeated).  Regions with the same chrom/start/end as the immediately
    preceding one are skipped, which assumes the input file is sorted.
    Only every ``onlyMultiplesOf``-th interval is kept, to subsample very
    large region lists.

    NOTE(review): relies on the module-level ``args`` (``verbose``,
    ``regionsLabel``) and ``intervalWrapper`` — confirm they are defined by
    the importing module.
    """
    regions = []
    regionsDict = OrderedDict()
    regionGroups = [(0, '')]

    prevInterval = None
    duplicates = 0
    totalIntervals = 0
    includedIntervals = 0

    # Read all lines up front and close the handle (the original leaked an
    # open file object).
    regionsFile = open(regionsFileName, 'r')
    try:
        regionLines = regionsFile.readlines()
    finally:
        regionsFile.close()

    for ginterval in GenomicIntervalReader(regionLines):
        totalIntervals += 1
        if str(ginterval)[0] == '#':
            # Group delimiter: flush the regions collected so far under a
            # unique label derived from the comment text.
            if includedIntervals > 1 and \
                    includedIntervals - regionGroups[-1][0] > 1:
                label = str(ginterval)[1:]
                newLabel = label
                if label in regionsDict:
                    # Loop to find a unique label name.
                    i = 0
                    while True:
                        i += 1
                        newLabel = label + "_r" + str(i)
                        if newLabel not in regionsDict:
                            break
                regionsDict[newLabel] = regions[:]
                regions = []
            continue
        # If the list of regions is too big, only consider a fraction of
        # the data.
        if totalIntervals % onlyMultiplesOf != 0:
            continue
        # Skip regions that have the same position as the previous one.
        # This assumes that the regions file given is sorted.
        if prevInterval and prevInterval.chrom == ginterval.chrom and \
                prevInterval.start == ginterval.start and \
                prevInterval.end == ginterval.end:
            if args.verbose:
                print("Gene in same region already included: %s %s:%s-%s. Skipping" %
                      (ginterval.fields[3], ginterval.chrom,
                       ginterval.start, ginterval.end))
            duplicates += 1
            continue
        else:
            prevInterval = ginterval

        regions.append(intervalWrapper(ginterval))
        includedIntervals += 1

    # Any trailing regions get the default label.
    if len(regions):
        regionsDict[args.regionsLabel] = regions

    if args.verbose:
        print("%d (%.2f) regions covering the exact same interval were found" %
              (duplicates, float(duplicates) * 100 / totalIntervals))
    return regionsDict
"""
Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed
"""
import sys

# Named imports instead of the original `from numpy import *` wildcard.
from numpy import float64, int32, isnan, savetxt, zeros

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

bw = BigWigFile(open(sys.argv[1]))
padding = int(sys.argv[2])

# Accumulated signal and per-position count of intervals with valid data.
totals = zeros(padding * 2, dtype=float64)
valid = zeros(padding * 2, dtype=int32)

for interval in GenomicIntervalReader(sys.stdin):
    # Integer midpoint of the interval; floor division replaces the
    # original floor() on a float quotient (same value, integer type).
    center = (interval.start + interval.end) // 2
    values = bw.get_as_array(interval.chrom, center - padding, center + padding)
    if values is None:
        # Chromosome absent from the bigwig file: skip instead of crashing.
        continue
    # Determine which positions had data and mask the rest for totalling.
    invalid = isnan(values)
    values[invalid] = 0
    totals += values
    valid += (~invalid)

# Average signal per relative position.
savetxt(sys.stdout, totals / valid)
class GFFReaderWrapper(NiceReaderWrapper):
    """ Reader wrapper for GFF files.

    Wrapper has two major functions:

    1. group entries for GFF file (via group column), GFF3 (via id
       attribute), or GTF (via gene_id/transcript id);
    2. convert coordinates from GFF format--starting and ending coordinates
       are 1-based, closed--to the 'traditional'/BED interval format--0
       based, half-open.  This is useful when using GFF files as inputs to
       tools that expect traditional interval format.
    """

    def __init__(self, reader, chrom_col=0, feature_col=2, start_col=3,
                 end_col=4, strand_col=6, score_col=5, fix_strand=False,
                 convert_to_bed_coord=False, **kwargs):
        NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col,
                                   start_col=start_col, end_col=end_col,
                                   strand_col=strand_col,
                                   fix_strand=fix_strand, **kwargs)
        self.feature_col = feature_col
        self.score_col = score_col
        self.convert_to_bed_coord = convert_to_bed_coord
        self.last_line = None
        self.cur_offset = 0
        # Read-ahead state: the first interval of the next feature, captured
        # while grouping the previous one, and its raw line length.
        self.seed_interval = None
        self.seed_interval_line_len = 0

    def parse_row(self, line):
        """ Parse one tab-separated GFF line into a GFFInterval. """
        return GFFInterval(self, line.split("\t"), self.chrom_col,
                           self.feature_col, self.start_col, self.end_col,
                           self.strand_col, self.score_col,
                           self.default_strand, fix_strand=self.fix_strand)

    def next(self):
        """ Return the next GFFFeature (or a Header/Comment line).

        Successive intervals are grouped into one feature by matching the
        seed interval's GFF 'group', GFF3 'ID'/'Parent', or GTF
        'transcript_id' attributes.
        """

        def handle_parse_error(parse_error):
            """ Report a ParseError via the print delegate and remember up
            to the first 10 offending lines.

            Fixed: previously this read the caller's ``except`` variable
            ``e`` instead of its own parameter.
            """
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate, "__call__"):
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(parse_error)))
            # For debugging, uncomment this to propagate parsing exceptions
            # up (the underlying reason for an unexpected StopIteration can
            # be found this way):
            # raise parse_error

        # Running byte count of the raw lines that make up the feature.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where the iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError as e:
                    handle_parse_error(e)
                # TODO: When no longer supporting python 2.4, use finally:
                raw_size += len(self.current_line)

        # If header or comment, clear seed interval and return it with its
        # size.
        if isinstance(self.seed_interval, (Header, Comment)):
            return_val = self.seed_interval
            return_val.raw_size = len(self.current_line)
            self.seed_interval = None
            self.seed_interval_line_len = 0
            return return_val

        # Initialize feature identifiers from the seed.
        feature_group = self.seed_interval.attributes.get('group', None)  # For GFF.
        feature_id = self.seed_interval.attributes.get('ID', None)  # For GFF3.
        feature_transcript_id = self.seed_interval.attributes.get(
            'transcript_id', None)  # For GTF.

        # Read all intervals associated with the seed.
        feature_intervals = [self.seed_interval]
        while True:
            try:
                interval = GenomicIntervalReader.next(self)
                raw_size += len(self.current_line)
            except StopIteration:
                # No more intervals to read, but the last feature still
                # needs to be returned.
                interval = None
                raw_size += len(self.current_line)
                break
            except ParseError as e:
                handle_parse_error(e)
                raw_size += len(self.current_line)
                continue

            # NOTE(review): the remainder of this method was reconstructed
            # from the identical duplicate earlier in this file; confirm
            # against upstream before merging.

            # Ignore comments.
            if isinstance(interval, Comment):
                continue

            # Determine if interval is part of the feature.
            part_of = False
            group = interval.attributes.get('group', None)
            # GFF test:
            if group and feature_group == group:
                part_of = True
            # GFF3 test:
            parent_id = interval.attributes.get('Parent', None)
            cur_id = interval.attributes.get('ID', None)
            if (cur_id and cur_id == feature_id) or \
                    (parent_id and parent_id == feature_id):
                part_of = True
            # GTF test:
            transcript_id = interval.attributes.get('transcript_id', None)
            if transcript_id and transcript_id == feature_transcript_id:
                part_of = True

            # If interval is not part of the feature, clean up and break.
            if not part_of:
                # Adjust raw size: the current line is not part of the feature.
                raw_size -= len(self.current_line)
                break

            # Interval is associated with the feature.
            feature_intervals.append(interval)

        # The last interval read is the seed for the next feature.
        self.seed_interval = interval
        self.seed_interval_line_len = len(self.current_line)

        # Return feature.
        feature = GFFFeature(self, self.chrom_col, self.feature_col,
                             self.start_col, self.end_col, self.strand_col,
                             self.score_col, self.default_strand,
                             fix_strand=self.fix_strand,
                             intervals=feature_intervals, raw_size=raw_size)

        # Convert to BED coords?
        if self.convert_to_bed_coord:
            convert_gff_coords_to_bed(feature)

        return feature
def main():
    """ Complement the intervals of a file against chromosome lengths.

    Parses command-line options, reads the interval file through a
    NiceReaderWrapper, loads chromosome lengths from the LEN file, and
    writes either the per-chromosome complement or (with --all) the
    subtraction of the intervals from whole-chromosome spans.
    """
    allchroms = False
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except Exception:
        # Bad/missing options: report usage and exit via doc_optparse.
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome. The lengths
    # are added to the lens dict and passed to the complement operation code
    # in bx.
    dbfile = fileinput.FileInput(lengths)

    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except Exception:
                # Assume LEN doesn't exist or is corrupt somehow; narrowed
                # from a bare except so SystemExit/KeyboardInterrupt pass.
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except Exception:
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement.
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open(out_fname, "w")
    try:
        for interval in generator:
            # Exact-type check kept from the original: subclasses of
            # GenomicInterval are deliberately written via their own str().
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    # Close on the success path too (the original only closed on error).
    out_file.close()