def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError, exc:
        fail( "Invalid file format: %s" % str( exc ) )
Beispiel #2
0
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
Beispiel #3
0
def main():
    mincols = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        pieces = bool( options.pieces )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    if in1_gff_format:
        # Intersect requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )
    if in2_gff_format:
        # Intersect requires coordinates in BED format.
        g2.convert_to_bed_coord = True

    out_file = open( out_fname, "w" )
    try:
        for feature in intersect( [g1, g2], pieces=pieces, mincols=mincols ):
            if isinstance( feature, GFFFeature ):
                # Convert back to GFF coordinates since reader converted automatically.
                convert_bed_coords_to_gff( feature )
                for interval in feature.intervals:
                    out_file.write( "%s\n" % "\t".join( interval.fields ) )
            elif isinstance( feature, GenomicInterval ):
                out_file.write( "%s\n" % "\t".join( feature.fields ) )
            else:
                out_file.write( "%s\n" % feature )
    except ParseError, e:
        out_file.close()
        fail( "Invalid file format: %s" % str( e ) )
Beispiel #4
0
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError, exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in coverage( [g1, g2] ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError, exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Beispiel #6
0
    def subtract_files(cls, fn1, fn2, out_fn):
        g1 = NiceReaderWrapper(fileinput.FileInput(fn1), fix_strand=True)
        g2 = NiceReaderWrapper(fileinput.FileInput(fn2), fix_strand=True)

        out_file = open(out_fn, "w")
        try:
            for feature in subtract([g1, g2], pieces=True, mincols=1):
                out_file.write("%s\n" % feature)
        except ParseError, exc:
            out_file.close()
            fail("Invalid file format: %s" % str(exc))
Beispiel #7
0
def main():
    mincols = 1
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for outfields in join(g1,
                              g2,
                              mincols=mincols,
                              rightfill=rightfill,
                              leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
Beispiel #8
0
def main():
    mincols = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type(line) is GenomicInterval:
                    out_file.write(
                        "%s\t%s\t%s\n" %
                        (line.chrom, str(line.startCol), str(line.endCol)))
                elif type(line) is list:
                    out_file.write("%s\t%s\t%s\n" %
                                   (line[chr_col_1], str(line[start_col_1]),
                                    str(line[end_col_1])))
                else:
                    out_file.write("%s\n" % line)
            else:
                if type(line) is GenomicInterval:
                    out_file.write("%s\n" % "\t".join(line.fields))
                elif type(line) is list:
                    out_file.write("%s\n" % "\t".join(line))
                else:
                    out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
Beispiel #9
0
def main():
    sameformat = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in concat( [g1, g2], sameformat=sameformat ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
Beispiel #10
0
def main():
    mincols = 1
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if type( outfields ) is list:
                out_file.write( "%s\n" % "\t".join( outfields ) )
            else:
                out_file.write( "%s\n" % outfields )
    except ParseError, exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
def main():
    mincols = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.mincols:
            mincols = int( options.mincols )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.startCol ), str( line.endCol ) ) )
                elif type( line ) is list:
                    out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) )
                else:
                    out_file.write( "%s\n" % line )
            else:
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\n" % "\t".join( line.fields ) )
                elif type( line ) is list:
                    out_file.write( "%s\n" % "\t".join( line ) )
                else:
                    out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    try:
        bases = base_coverage(g1)
    except ParseError, exc:
        fail( "Invalid file format: %s" % str( exc ) )
Beispiel #13
0
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    try:
        bases = base_coverage(g1)
    except ParseError, exc:
        fail("Invalid file format: %s" % str(exc))
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
    out_file = open( out_fname, "w" )
    out_file.write( "%s\n" % str( bases ) )
    out_file.close()
    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    f1 = open( in_fname, "r" )
    out_file = open( out_fname, "w" )

    # If "merge"
    if output == 1:
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write( "%s\n" % "\t".join( fields ) )

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval( g1, fileline.split("\t"),
                                                            g1.chrom_col,
                                                            g1.start_col,
                                                            g1.end_col,
                                                            g1.strand_col,
                                                            g1.default_strand,
                                                            g1.fix_strand )
                    except Exception as exc:
                        print(str( exc ), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write( "%s\n" % outinterval )

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
Beispiel #16
0
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if type( outfields ) is list:
                out_file.write( "%s\n" % "\t".join( outfields ) )
            else:
                out_file.write( "%s\n" % outfields )
    except ParseError, exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
    except MemoryError:
        out_file.close()
        fail( "Input datasets were too large to complete the join operation." )

    out_file.close()

    if g1.skipped > 0:
        print skipped( g1, filedesc=" of 1st dataset" )
    if g2.skipped > 0:
        print skipped( g2, filedesc=" of 2nd dataset" )

if __name__ == "__main__":
    main()
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )

    # Find flanking features.
    out_file = open( out_fname, "w" )
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type( result ) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'

                    # Invervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff( line )
                    closest_feature = convert_bed_coords_to_gff( closest_feature )

                    # Replace double quotes with single quotes in closest feature's attributes.
                    out_file.write( "%s closest_feature \"%s\" \n" %
                                    ( "\t".join( line.fields ),
                                      "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
                                      ) )
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend( line.fields )
                    output_line_fields.extend( closest_feature.fields )
                    out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
            else:
                out_file.write( "%s\n" % result )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
Beispiel #18
0
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)

    # Find flanking features.
    out_file = open(out_fname, "w")
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type(result) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'

                    # Invervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff(line)
                    closest_feature = convert_bed_coords_to_gff(
                        closest_feature)

                    # Replace double quotes with single quotes in closest feature's attributes.
                    out_file.write(
                        "%s closest_feature \"%s\" \n" %
                        ("\t".join(line.fields), "\t".join(
                            closest_feature.fields).replace("\"", "\\\"")))
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend(line.fields)
                    output_line_fields.extend(closest_feature.fields)
                    out_file.write("%s\n" % ("\t".join(output_line_fields)))
            else:
                out_file.write("%s\n" % result)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))
Beispiel #19
0
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    f1 = open(in_fname, "r")
    out_file = open(out_fname, "w")

    # If "merge"
    if output == 1:
        fields = [
            "."
            for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)
        ]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write("%s\n" % "\t".join(fields))

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write("%s\n" % line.rstrip("\n\r"))

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(
                            g1, fileline.split("\t"), g1.chrom_col,
                            g1.start_col, g1.end_col, g1.strand_col,
                            g1.default_strand, g1.fix_strand)
                    except Exception as exc:
                        print(str(exc), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
Beispiel #20
0
def main():
    allchroms = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed copmlement operation code in bx.
    dbfile = fileinput.FileInput(lengths)

    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except:
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open(out_fname, "w")

    try:
        for interval in generator:
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def main():
    allchroms = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed copmlement operation code in bx.
    dbfile = fileinput.FileInput( lengths )

    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except:
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open( out_fname, "w" )

    try:
        for interval in generator:
            if type( interval ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( interval ) )
            else:
                out_file.write( "%s\n" % interval )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
Beispiel #22
0
    out_file = open(out_fname, "w")

    try:
        for outfields in join(g1,
                              g2,
                              mincols=mincols,
                              rightfill=rightfill,
                              leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    except MemoryError:
        out_file.close()
        fail("Input datasets were too large to complete the join operation.")

    out_file.close()

    if g1.skipped > 0:
        print skipped(g1, filedesc=" of 1st dataset")
    if g2.skipped > 0:
        print skipped(g2, filedesc=" of 2nd dataset")


if __name__ == "__main__":
    main()