def __init__(self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
             strand_col=6, score_col=5, default_strand='.', fix_strand=False):
    """Build a GFF interval from one split line of a GFF file.

    Initialises the GenomicInterval base with the chrom/start/end/strand
    columns, then records the feature and score columns and parses the
    attributes found in column 8.

    :param reader: reader object handed through to GenomicInterval.
    :param fields: list of column values for the line.
    :param chrom_col: index of the chromosome column.
    :param feature_col: index of the feature column.
    :param start_col: index of the start column.
    :param end_col: index of the end column.
    :param strand_col: index of the strand column.
    :param score_col: index of the score column.
    :param default_strand: strand handed to GenomicInterval as its default.
    :param fix_strand: passed through to GenomicInterval; when true the
        unknown-strand workaround below is not applied.
    :raises MissingFieldError: if ``feature_col`` or ``score_col`` is not
        smaller than the number of fields.
    """
    # HACK: GFF format allows '.' for strand but GenomicInterval does not.
    # To get around this, temporarily set strand and then unset it after
    # initialising GenomicInterval.
    unknown_strand = False
    if not fix_strand and fields[strand_col] == '.':
        unknown_strand = True
        fields[strand_col] = '+'
    try:
        GenomicInterval.__init__(self, reader, fields, chrom_col, start_col, end_col,
                                 strand_col, default_strand, fix_strand=fix_strand)
    finally:
        # Undo the placeholder on the caller's list even when the base
        # initialiser raises, so a rejected line is not left modified.
        if unknown_strand:
            fields[strand_col] = '.'
    if unknown_strand:
        self.strand = '.'
        self.fields[strand_col] = '.'

    # Handle feature, score column.
    self.feature_col = feature_col
    if self.feature_col >= self.nfields:
        raise MissingFieldError("No field for feature_col (%d)" % feature_col)
    self.feature = self.fields[self.feature_col]
    self.score_col = score_col
    if self.score_col >= self.nfields:
        raise MissingFieldError("No field for score_col (%d)" % score_col)
    self.score = self.fields[self.score_col]

    # GFF attributes.
    self.attributes = parse_gff_attributes(fields[8])
def parse_row(self, line):
    """Turn one tab-separated line into an interval.

    The line is split on tabs, wrapped in a GenomicInterval using this
    reader's column settings, and the result of passing that interval
    through ``convert_gff_coords_to_bed`` is returned.
    """
    # HACK: this should return a GFF interval, but bx-python operations
    # require GenomicInterval objects and subclasses will not work.
    columns = line.split("\t")
    raw_interval = GenomicInterval(
        self,
        columns,
        self.chrom_col,
        self.start_col,
        self.end_col,
        self.strand_col,
        self.default_strand,
        fix_strand=self.fix_strand,
    )
    return convert_gff_coords_to_bed(raw_interval)
def complement(reader, lens):
    """Yield GenomicInterval objects for the inverted bitsets of ``reader``.

    :param reader: interval reader; it is wrapped in a
        BitsetSafeReaderWrapper before the bitsets are built.
    :param lens: mapping of chromosome name to length. It is passed to the
        wrapper and to ``binned_bitsets``; chromosomes missing from it are
        scanned up to ``MAX``.
    :returns: generator of GenomicInterval objects, each created with a
        default strand of "+".
    """
    # Handle any ValueError, IndexError and OverflowError exceptions that may
    # be thrown when the bitsets are being created by skipping the problem
    # lines.
    complement_reader = BitsetSafeReaderWrapper(reader, lens=lens)
    bitsets = complement_reader.binned_bitsets(upstream_pad=0, downstream_pad=0, lens=lens)

    # NOT them all
    for chrom_bits in bitsets.values():
        chrom_bits.invert()

    # Walk the set bits of every inverted bitset and emit them as intervals.
    for chrom, bitset in bitsets.items():
        set_ranges = bits_set_in_range(bitset, 0, lens.get(chrom, MAX))
        try:
            for start, end in set_ranges:
                rdr = complement_reader
                width = max(rdr.chrom_col, rdr.start_col, rdr.end_col) + 1
                fields = ["."] * width
                # default the strand column to a + if it exists
                if 0 <= rdr.strand_col < len(fields):
                    fields[rdr.strand_col] = "+"
                fields[rdr.chrom_col] = chrom
                fields[rdr.start_col] = start
                fields[rdr.end_col] = end
                yield GenomicInterval(rdr, fields, rdr.chrom_col, rdr.start_col,
                                      rdr.end_col, rdr.strand_col, "+")
        except IndexError as e:
            complement_reader.skipped += 1
            # no reason to stuff an entire bad file into memory
            if complement_reader.skipped < 10:
                complement_reader.skipped_lines.append(
                    (complement_reader.linenum, complement_reader.current_line, str(e)))
# If "minimum" we output the smallest interval in each cluster (output == 4);
# output == 5 keeps the largest interval instead.
if output == 4 or output == 5:
    linenums = list()
    f1.seek(0)
    fileLines = f1.readlines()
    for chrom, tree in clusters.items():
        for start, end, lines in tree.getregions():
            outsize = -1
            outinterval = None
            for line in lines:
                # three nested for loops?
                # should only execute this code once per line
                fileline = fileLines[line].rstrip("\n\r")
                try:
                    cluster_interval = GenomicInterval(
                        g1, fileline.split("\t"), g1.chrom_col, g1.start_col,
                        g1.end_col, g1.strand_col, g1.default_strand, g1.fix_strand)
                except Exception as exc:
                    # "except ... as" and an explicit stderr write are valid on
                    # both Python 2.6+ and Python 3, with no __future__ import.
                    sys.stderr.write("%s\n" % str(exc))
                    f1.close()
                    sys.exit()
                interval_size = cluster_interval.end - cluster_interval.start
                # Strict comparisons: on a tie the first interval seen wins.
                if outsize == -1 or \
                   (outsize > interval_size and output == 4) or \
                   (outsize < interval_size and output == 5):
                    outinterval = cluster_interval
                    outsize = interval_size
            out_file.write("%s\n" % outinterval)

f1.close()
out_file.close()
def main():
    """Command-line entry point: cluster the intervals of a file.

    Options are parsed from the module docstring via ``doc_optparse``. The
    two positional arguments are the input and output file names. The
    ``output`` option selects what is written:

    * 1 -- one merged interval per cluster region (default)
    * 2 -- the original lines that belong to a cluster, in file order,
      together with the line numbers returned as ``extra`` by
      ``find_clusters``
    * 3 -- the original lines, grouped cluster by cluster
    * 4 -- the smallest interval of each cluster region
    * 5 -- the largest interval of each cluster region

    If the reader skipped any lines, a summary is printed at the end.
    """
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # An overlap overrides any distance and is passed on as a negative value.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    # The context manager closes both files on every exit path, including
    # the sys.exit() taken when a line cannot be re-parsed below.
    with open(in_fname, "r") as f1, open(out_fname, "w") as out_file:
        # If "merge"
        if output == 1:
            fields = ["."] * (max(g1.chrom_col, g1.start_col, g1.end_col) + 1)
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    fields[g1.chrom_col] = chrom
                    fields[g1.start_col] = str(start)
                    fields[g1.end_col] = str(end)
                    out_file.write("%s\n" % "\t".join(fields))

        # If "filtered" we preserve order of file and comments, etc.
        if output == 2:
            clustered_linenums = set()
            for tree in clusters.values():
                clustered_linenums.update(tree.getlines())
            for linenum, line in enumerate(f1):
                if linenum in clustered_linenums or linenum in extra:
                    out_file.write("%s\n" % line.rstrip("\n\r"))

        # If "clustered" we output original intervals, but near each other
        # (i.e. clustered)
        if output == 3:
            file_lines = f1.readlines()
            for tree in clusters.values():
                for linenum in tree.getlines():
                    out_file.write("%s\n" % file_lines[linenum].rstrip("\n\r"))

        # If "minimum" we output the smallest interval in each cluster
        # (output == 5 keeps the largest instead).
        if output == 4 or output == 5:
            file_lines = f1.readlines()
            for tree in clusters.values():
                for start, end, lines in tree.getregions():
                    outsize = -1
                    outinterval = None
                    for line in lines:
                        fileline = file_lines[line].rstrip("\n\r")
                        try:
                            cluster_interval = GenomicInterval(
                                g1, fileline.split("\t"), g1.chrom_col, g1.start_col,
                                g1.end_col, g1.strand_col, g1.default_strand,
                                g1.fix_strand)
                        except Exception as exc:
                            print(str(exc), file=sys.stderr)
                            sys.exit()
                        interval_size = cluster_interval.end - cluster_interval.start
                        # Strict comparisons: on a tie the first interval seen wins.
                        if outsize == -1 or \
                           (outsize > interval_size and output == 4) or \
                           (outsize < interval_size and output == 5):
                            outinterval = cluster_interval
                            outsize = interval_size
                    out_file.write("%s\n" % outinterval)

    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))