Exemple #1
0
    def __init__( self, reader, fields, chrom_col=0, feature_col=2, start_col=3, end_col=4,
                  strand_col=6, score_col=5, default_strand='.', fix_strand=False ):
        """
        Initialise the interval from one row of GFF column values.

        `fields` is the list of column values for the row and every `*_col`
        argument is an index into it. Once the GenomicInterval part has been
        initialised, `feature`, `score` and `attributes` are filled in from
        the feature column, the score column and column 8 respectively.

        Raises MissingFieldError when `feature_col` or `score_col` is
        >= `self.nfields`.
        """
        # HACK: GFF permits '.' as a strand but GenomicInterval does not, so
        # a '.' is presented as '+' while the base class initialises and is
        # put back immediately afterwards.
        placeholder_strand = not fix_strand and fields[ strand_col ] == '.'
        if placeholder_strand:
            fields[ strand_col ] = '+'
        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col,
                                  default_strand, fix_strand=fix_strand )
        if placeholder_strand:
            self.strand = '.'
            self.fields[ strand_col ] = '.'

        # Feature column: record the index, then validate it before reading.
        self.feature_col = feature_col
        if self.nfields <= self.feature_col:
            raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
        self.feature = self.fields[ self.feature_col ]

        # Score column: same treatment as the feature column.
        self.score_col = score_col
        if self.nfields <= self.score_col:
            raise MissingFieldError( "No field for score_col (%d)" % score_col )
        self.score = self.fields[ self.score_col ]

        # GFF attributes are parsed from column 8 of the row.
        self.attributes = parse_gff_attributes( fields[8] )
Exemple #2
0
 def parse_row( self, line ):
     """
     Turn one tab-separated line into a GenomicInterval and return it after
     passing it through convert_gff_coords_to_bed.
     """
     # HACK: a GFF interval would be the natural return type, but bx-python
     # operations require GenomicInterval objects and subclasses will not work.
     columns = line.split( "\t" )
     gff_row = GenomicInterval( self, columns, self.chrom_col, self.start_col,
                                self.end_col, self.strand_col, self.default_strand,
                                fix_strand=self.fix_strand )
     return convert_gff_coords_to_bed( gff_row )
Exemple #3
0
def complement(reader, lens):
    """
    Generate the complement of the intervals supplied by `reader`.

    The intervals are loaded into per-chromosome bitsets, every bitset is
    inverted, and one GenomicInterval is yielded for each (start, end) pair
    that bits_set_in_range reports between 0 and ``lens.get(chrom, MAX)``.
    Yielded intervals carry a "+" strand: it is written into the strand
    column when that column lies inside the generated row, and "+" is also
    passed as the default strand.

    An IndexError raised while a chromosome's intervals are being produced
    is counted on the reader wrapper (and recorded in its `skipped_lines`
    for the first few occurrences); processing then moves on to the next
    chromosome.
    """
    # The wrapper skips problem lines (ValueError, IndexError, OverflowError)
    # while the bitsets are being created.
    complement_reader = BitsetSafeReaderWrapper(reader, lens=lens)
    bitsets = complement_reader.binned_bitsets(upstream_pad=0,
                                               downstream_pad=0,
                                               lens=lens)
    # Invert every bitset in place.
    for covered in bitsets.values():
        covered.invert()

    # Walk each inverted bitset and emit its set ranges as intervals.
    for chrom, bitset in bitsets.items():
        out_intervals = bits_set_in_range(bitset, 0, lens.get(chrom, MAX))
        try:
            for start, end in out_intervals:
                # Placeholder row just wide enough to hold chrom/start/end.
                width = max(complement_reader.chrom_col,
                            complement_reader.start_col,
                            complement_reader.end_col) + 1
                row = ["."] * width
                # Default the strand column to "+" when it exists in the row.
                strand_idx = complement_reader.strand_col
                if 0 <= strand_idx < len(row):
                    row[strand_idx] = "+"
                row[complement_reader.chrom_col] = chrom
                row[complement_reader.start_col] = start
                row[complement_reader.end_col] = end
                yield GenomicInterval(complement_reader, row,
                                      complement_reader.chrom_col,
                                      complement_reader.start_col,
                                      complement_reader.end_col,
                                      complement_reader.strand_col,
                                      "+")
        except IndexError as e:
            complement_reader.skipped += 1
            # Keep only the first few bad lines; there is no reason to hold
            # an entire bad file in memory.
            if complement_reader.skipped < 10:
                bad_line = (complement_reader.linenum,
                            complement_reader.current_line,
                            str(e))
                complement_reader.skipped_lines.append(bad_line)
Exemple #4
0
    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(
                            g1, fileline.split("\t"), g1.chrom_col,
                            g1.start_col, g1.end_col, g1.strand_col,
                            g1.default_strand, g1.fix_strand)
                    except Exception, exc:
                        print >> sys.stderr, str(exc)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)

    f1.close()
    out_file.close()
Exemple #5
0
def main():
    """
    Command-line entry point: cluster the intervals of one file and write the
    result to another.

    Options are parsed from the module docstring via doc_optparse. The
    positional arguments are the input and output file names. The integer
    `output` option selects what is written:

    * 1 -- "merge": one row per cluster region (chrom, start, end).
    * 2 -- "filtered": the original lines that belong to a cluster, plus the
      lines listed in `extra` by find_clusters, in original file order.
    * 3 -- "clustered": the original lines, grouped cluster by cluster.
    * 4 / 5 -- for each cluster region, the single interval with the
      smallest (4) or largest (5) size.

    Both files are closed on every exit path, including the early
    ``sys.exit()`` taken when a line cannot be re-parsed in modes 4/5.
    """
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            # An overlap requirement is expressed as a negative distance.
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except Exception:
        # Only real errors are reported as usage problems; SystemExit and
        # KeyboardInterrupt are left to propagate.
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    with open(in_fname, "r") as f1, open(out_fname, "w") as out_file:
        if output == 1:
            # "merge": emit one synthetic row per cluster region.
            fields = ["."] * (max(g1.chrom_col, g1.start_col, g1.end_col) + 1)
            for chrom, tree in clusters.items():
                for start, end, lines in tree.getregions():
                    fields[g1.chrom_col] = chrom
                    fields[g1.start_col] = str(start)
                    fields[g1.end_col] = str(end)
                    out_file.write("%s\n" % "\t".join(fields))

        elif output == 2:
            # "filtered": preserve the order of the file, comments, etc.
            clustered_lines = set()
            for tree in clusters.values():
                clustered_lines.update(tree.getlines())
            for linenum, line in enumerate(f1):
                if linenum in clustered_lines or linenum in extra:
                    out_file.write("%s\n" % line.rstrip("\n\r"))

        elif output == 3:
            # "clustered": original intervals, written cluster by cluster so
            # that members of a cluster end up next to each other.
            file_lines = f1.readlines()
            for tree in clusters.values():
                for linenum in tree.getlines():
                    out_file.write("%s\n" % file_lines[linenum].rstrip("\n\r"))

        elif output == 4 or output == 5:
            # "minimum" (4) / "maximum" (5): one representative interval per
            # cluster region, chosen by interval size.
            file_lines = f1.readlines()
            for tree in clusters.values():
                for start, end, lines in tree.getregions():
                    outsize = -1
                    outinterval = None
                    for line in lines:
                        fileline = file_lines[line].rstrip("\n\r")
                        try:
                            cluster_interval = GenomicInterval(
                                g1, fileline.split("\t"), g1.chrom_col,
                                g1.start_col, g1.end_col, g1.strand_col,
                                g1.default_strand, g1.fix_strand)
                        except Exception as exc:
                            print(str(exc), file=sys.stderr)
                            # The enclosing `with` closes both files.
                            # NOTE(review): sys.exit() with no argument exits
                            # with status 0; kept as in the original.
                            sys.exit()
                        interval_size = cluster_interval.end - cluster_interval.start
                        # outsize == -1 means no candidate has been seen yet.
                        if outsize == -1 or \
                           (outsize > interval_size and output == 4) or \
                           (outsize < interval_size and output == 5):
                            outinterval = cluster_interval
                            outsize = interval_size
                    out_file.write("%s\n" % outinterval)

    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))