Example 1
def main():
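    # Pull alignment blocks overlapping BED intervals read from stdin out of
    # a set of indexed MAF files (args[2:]) and write them as MAF on stdout.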

    options, args = doc_optparse.parse(__doc__)
    try:
        sources = args[0].translate(tree_tx).split()
        ref_2bit = bx.seq.twobit.TwoBitFile(open(args[1]))
        index = maf.MultiIndexed(args[2:])

        out = maf.Writer(sys.stdout)
        missing_data = bool(options.missingData)
        use_strand = bool(options.strand)
    except:
        doc_optparse.exception()

    for line in sys.stdin:
        fields = line.split()
        ref_src, start, end = fields[0:3]
        if use_strand and len(fields) > 5:
            strand = fields[5]
        else:
            strand = '+'
        do_interval(sources, index, out, ref_src, int(start), int(end),
                    ref_2bit, missing_data, strand)

    out.close()
Example 2
def main():
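    # Find proximal regions between two interval files in the given
    # direction, writing result rows as tab-separated lines.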
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)
    out_file = open(out_fname, "w")
    try:
        for line in proximal_region_finder([g1, g2], direction):
            if type(line) is list:
                out_file.write("%s\n" % "\t".join(line))
            else:
                out_file.write("%s\n" % line)
    except ParseError, exc:
        fail("Invalid file format: %s" % str(exc))
Example 3
def main():
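    # Score intervals in a data file against a model using a sliding window
    # controlled by the window, shift, low, and high options.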

    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        data_fname, model_fname, out_fname = args
        window = int(getopt(options, "window", 100))
        shift = int(getopt(options, "shift", 5))
        low = float(getopt(options, "low", -1.0))
        high = float(getopt(options, "high", 1.0))
        if options.mapping:
            align_count, mapping = rp.mapping.alignment_mapping_from_file(file(options.mapping))
        else:
            mapping = None
        modname = getattr(options, "model")
        if modname is None:
            modname = "standard"
        reorder = getopt(options, "reorder", None)
        if reorder:
            reorder = map(int, reorder.split(","))
    except:
        doc_optparse.exception()

    out = open(out_fname, "w")
    run(open(data_fname), modname, open(model_fname), out, mapping, window, shift, low, high, reorder)
    out.close()
Example 4
def main():
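    # Intersect two interval files; GFF inputs are read in BED coordinates
    # for the operation and converted back to GFF coordinates on output.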
    mincols = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.mincols:
            mincols = int( options.mincols )
        pieces = bool( options.pieces )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    if in1_gff_format:
        # Intersect requires coordinates in BED format.
        g1.convert_to_bed_coord = True
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )
    if in2_gff_format:
        # Intersect requires coordinates in BED format.
        g2.convert_to_bed_coord = True

    out_file = open( out_fname, "w" )
    try:
        for feature in intersect( [g1, g2], pieces=pieces, mincols=mincols ):
            if isinstance( feature, GFFFeature ):
                # Convert back to GFF coordinates since reader converted automatically.
                convert_bed_coords_to_gff( feature )
                for interval in feature.intervals:
                    out_file.write( "%s\n" % "\t".join( interval.fields ) )
            elif isinstance( feature, GenomicInterval ):
                out_file.write( "%s\n" % "\t".join( feature.fields ) )
            else:
                out_file.write( "%s\n" % feature )
    except ParseError, e:
        out_file.close()
        fail( "Invalid file format: %s" % str( e ) )
Example 5
def main():
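    # Run coverage() over the two interval streams and write each resulting
    # interval's fields as a tab-separated line.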
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in coverage( [g1, g2] ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError, exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Example 6
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )
    out_file = open( out_fname, "w" )
    try:
        for line in proximal_region_finder([g1,g2], direction):
            if type( line ) is list:
                out_file.write( "%s\n" % "\t".join( line ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError, exc:
        fail( "Invalid file format: %s" % str( exc ) )
Example 7
def main():
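    # Cluster intervals that lie within `distance` of one another (an
    # overlap requirement is expressed as a negative distance), keeping
    # clusters with at least `minregions` members.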
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError, exc:
        fail("Invalid file format: %s" % str(exc))
def __main__():
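    # Append a computed column: evaluate `expr` against each row of tabular
    # data on stdin and print the row followed by the result, headed `colname`.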

    # Parse command line arguments
    options, args = doc_optparse.parse( __doc__ )
    try:
        keep_header = bool( options.header )
        keep_comments = bool( options.comments )
        expr = args[0]
        colname = args[1]
    except:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr: expr = compile( expr, '<expr arg>', 'eval' )

    for element in bx.tabular.io.Reader( sys.stdin ):
        if type( element ) is bx.tabular.io.Header:
            if keep_header: 
                print str( element ) + "\t" + colname
        elif type( element ) is bx.tabular.io.Comment:
            if keep_comments: 
                print element
        else:
            val = eval( expr, dict( row=element ) )
            print str( element ) + "\t" + str( val )
Example 9
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError, exc:
        fail( "Invalid file format: %s" % str( exc ) )
Example 10
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in coverage([g1, g2]):
            if type(line) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
def __main__():

    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        expr = args[0]
        colname = args[1]
    except:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr: expr = compile(expr, '<expr arg>', 'eval')

    for element in bx.tabular.io.Reader(sys.stdin):
        if type(element) is bx.tabular.io.Header:
            if keep_header:
                print str(element) + "\t" + colname
        elif type(element) is bx.tabular.io.Comment:
            if keep_comments:
                print element
        else:
            val = eval(expr, dict(row=element))
            print str(element) + "\t" + str(val)
Example 12
def __main__():
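    # Fetch sequence data for intervals (BED or GFF); a supplied FASTA file
    # is first converted to 2bit with the external faToTwoBit tool.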
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )

            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_stderr = open( tmp_name, 'wb' )
            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open( tmp_name, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception, e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
Example 13
def main():
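    # Build an interval index for a MAF file; bz2 and lzo inputs are opened
    # through their seek tables so that file offsets (tells) remain usable.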

    # Parse command line

    options, args = doc_optparse.parse( __doc__ )

    try:
        maf_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if maf_file.endswith( ".bz2" ):
            table_file = maf_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index bz2 compressed files first "
                                   "create a bz2t file with bzip-table." )
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableBzip2File( maf_file, table_file )
            # Strip .bz2 from the filename before adding ".index"
            maf_file = maf_file[:-4]
        elif maf_file.endswith( ".lzo" ):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = maf_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index lzo compressed files first "
                                   "create a lzot file with lzop_build_offset_table." )
            # Open with SeekableBzip2File so we have tell support
            maf_in = SeekableLzopFile( maf_file, table_file )
            # Strip .lzo from the filename before adding ".index"
            maf_file = maf_file[:-4]
        else:
            maf_in = open( maf_file )
        # Determine the name of the index file
        if len( args ) > 1: 
            index_file = args[1]
        else: 
            index_file = maf_file + ".index" 
        if options.species:
            species = options.species.split( "," )
        else:
            species = None
    except:
        doc_optparse.exception()

    maf_reader = bx.align.maf.Reader( maf_in )

    indexes = interval_index_file.Indexes()

    # Need to be a bit tricky in our iteration here to get the 'tells' right
    while 1:
        pos = maf_reader.file.tell()
        block = maf_reader.next()
        if block is None: break
        for c in block.components:
            if species is not None and c.src.split('.')[0] not in species:
                continue
            indexes.add( c.src, c.forward_strand_start, c.forward_strand_end, pos, max=c.src_size )

    out = open( index_file, 'w' )
    indexes.write( out )
    out.close()
Example 15
def main():
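    # Positional args: an HDF5 file, a mapping file, input and output files,
    # and 1-based chrom/start/end column numbers (converted to 0-based).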
    # Parse command line
    options, args = doc_optparse.parse( __doc__ )
    try:
        h5_fname = args[0]
        mapping_fname = args[1]
        in_fname = args[2]
        out_fname = args[3]
        chrom_col, start_col, end_col = map( lambda x: int( x ) - 1, args[4:7] )
        per_col = bool( options.perCol )
    except Exception, e:
        doc_optparse.exception()
Example 16
def main():
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        h5_fname = args[0]
        mapping_fname = args[1]
        in_fname = args[2]
        out_fname = args[3]
        chrom_col, start_col, end_col = map(lambda x: int(x) - 1, args[4:7])
        per_col = bool(options.perCol)
    except Exception, e:
        doc_optparse.exception()
Example 17
def main():
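    # Join two interval files on overlaps of at least `mincols` bases,
    # optionally left- and/or right-filling rows without a match.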
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols: mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for outfields in join(g1,
                              g2,
                              mincols=mincols,
                              rightfill=rightfill,
                              leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError, exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
Example 18
def main():
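    # Merge overlapping intervals within one file; the threecol option
    # restricts output to chrom, start, and end.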
    mincols = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.mincols:
            mincols = int(options.mincols)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type(line) is GenomicInterval:
                    out_file.write(
                        "%s\t%s\t%s\n" %
                        (line.chrom, str(line.startCol), str(line.endCol)))
                elif type(line) is list:
                    out_file.write("%s\t%s\t%s\n" %
                                   (line[chr_col_1], str(line[start_col_1]),
                                    str(line[end_col_1])))
                else:
                    out_file.write("%s\n" % line)
            else:
                if type(line) is GenomicInterval:
                    out_file.write("%s\n" % "\t".join(line.fields))
                elif type(line) is list:
                    out_file.write("%s\n" % "\t".join(line))
                else:
                    out_file.write("%s\n" % line)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))

    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
Example 19
def __main__():
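    # Filter tabular data on stdin: keep rows for which `expr` evaluates
    # true, optionally projecting onto the requested columns.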

    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        cols = []
        if options.cols:
            for c in options.cols.split(','):
                try:
                    v = int(c)
                except ValueError:
                    v = c
                cols.append(v)
        if len(args) > 0:
            expr = args[0]
        else:
            expr = None
        if options.force_header:
            force_header = bx.tabular.io.FIRST_LINE_IS_HEADER
        else:
            force_header = None
    except Exception:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, '<expr arg>', 'eval')

    for element in bx.tabular.io.TableReader(sys.stdin,
                                             force_header=force_header):
        if isinstance(element, bx.tabular.io.Header):
            if keep_header:
                if cols:
                    print("#" + "\t".join(element[c] for c in cols))
                else:
                    print(element)
        elif isinstance(element, bx.tabular.io.Comment):
            if keep_comments:
                print(element)
        else:
            if expr is None or bool(eval(expr, dict(row=element))):
                if cols:
                    print("\t".join(element[c] for c in cols))
                else:
                    print(element)
Example 20
def main():
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0
    leftfill = False
    rightfill = False
    
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )      
        if options.mincols: mincols = int( options.mincols )
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )
    g2 = NiceReaderWrapper( fileinput.FileInput( in2_fname ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for outfields in join(g1, g2, mincols=mincols, rightfill=rightfill, leftfill=leftfill):
            if type( outfields ) is list:
                out_file.write( "%s\n" % "\t".join( outfields ) )
            else:
                out_file.write( "%s\n" % outfields )
    except ParseError, exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Example 21
def main():
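    # Build a Dataset of the given extension and id, attach optional
    # metadata, and print whatever errors its datatype validation reports.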
    options, args = doc_optparse.parse( __doc__ )

    try:
        extension = options.ext
    except:
        doc_optparse.exception()

    # create datatype
    data = model.Dataset( extension=extension, id=int( args[0] ) )
    data.file_path = "/home/ian/trunk/database/files/"
    
    if options.metadata:
        data.metadata = util.string_to_object( options.metadata )

    errors = data.datatype.validate( data )
    print util.object_to_string(errors)
Example 22
def __main__():

    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)
    try:
        keep_header = bool(options.header)
        keep_comments = bool(options.comments)
        cols = []
        if options.cols:
            for c in options.cols.split(","):
                try:
                    v = int(c)
                except ValueError:
                    v = c
                cols.append(v)
        if len(args) > 0:
            expr = args[0]
        else:
            expr = None
        if options.force_header:
            force_header = bx.tabular.io.FIRST_LINE_IS_HEADER
        else:
            force_header = None
    except:
        doc_optparse.exception()

    # Compile expression for SPEED
    if expr:
        expr = compile(expr, "<expr arg>", "eval")

    for element in bx.tabular.io.TableReader(sys.stdin, force_header=force_header):
        if type(element) is bx.tabular.io.Header:
            if keep_header:
                if cols:
                    print "#" + "\t".join(element[c] for c in cols)
                else:
                    print element
        elif type(element) is bx.tabular.io.Comment:
            if keep_comments:
                print element
        else:
            if expr is None or bool(eval(expr, dict(row=element))):
                if cols:
                    print "\t".join([element[c] for c in cols])
                else:
                    print element
Example 23
def main():
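    # Concatenate two interval files; `sameformat` indicates both inputs
    # share a single column layout.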
    sameformat = False

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.sameformat:
            sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in concat( [g1, g2], sameformat=sameformat ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
Example 24
def main():
    mincols = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.mincols:
            mincols = int( options.mincols )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    out_file = open( out_fname, "w" )

    try:
        for line in merge(g1, mincols=mincols):
            if options.threecol:
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\t%s\t%s\n" % ( line.chrom, str( line.startCol ), str( line.endCol ) ) )
                elif type( line ) is list:
                    out_file.write( "%s\t%s\t%s\n" % ( line[chr_col_1], str( line[start_col_1] ), str( line[end_col_1] ) ) )
                else:
                    out_file.write( "%s\n" % line )
            else:
                if type( line ) is GenomicInterval:
                    out_file.write( "%s\n" % "\t".join( line.fields ) )
                elif type( line ) is list:
                    out_file.write( "%s\n" % "\t".join( line ) )
                else:
                    out_file.write( "%s\n" % line )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )

    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
Example 25
def main():

    options, args = doc_optparse.parse( __doc__ )
    try:
        sources = args[0].translate( tree_tx ).split()
        seq_db = load_seq_db( args[1] )
        index = bx.align.maf.MultiIndexed( args[2:] )

        out = bx.align.maf.Writer( sys.stdout )
        missing_data = bool(options.missingData)
    except:
        doc_optparse.exception()

    for line in sys.stdin:
        ref_src, start, end = line.split()[0:3]
        do_interval( sources, index, out, ref_src, int( start ), int( end ), seq_db, missing_data )

    out.close()
Example 26
def main():
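    # Print a template rendered once per alignment block read from stdin,
    # exposing the block as `a` and its components as `c`.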

    # Parse command line arguments
    options, args = doc_optparse.parse(__doc__)

    try:
        template = Template(args[0])
        format = options.format
        if not format: format = "maf"
    except:
        doc_optparse.exception()

    reader = align.get_reader(format, sys.stdin)

    for a in reader:
        template.a = a
        template.c = a.components
        print template
Example 27
def main():

    # Parse command line arguments
    options, args = doc_optparse.parse( __doc__ )

    try:
        template = Template( args[0] )
        format = options.format
        if not format: format = "maf"
    except:
        doc_optparse.exception()

    reader = align.get_reader( format, sys.stdin ) 

    for a in reader: 
        template.a = a
        template.c = a.components
        print template
Example 29
def main():
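    # Count the total number of bases covered by the intervals in the input.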
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    try:
        bases = base_coverage(g1)
    except ParseError, exc:
        fail( "Invalid file format: %s" % str( exc ) )
Example 30
def main():
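    # Mask CpG sites in a MAF stream from stdin with the chosen mask
    # character and report the percentage of bases masked.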
    options, args = doc_optparse.parse( __doc__ )
    try:
        if options.mask:
            mask = options.mask
        else:
            mask = "?"
    except:
        doc_optparse.exception()

    reader = bx.align.maf.Reader( sys.stdin )
    writer = bx.align.maf.Writer( sys.stdout )

    if options.restricted:
        cpgfilter = bx.align.sitemask.cpg.Restricted( mask=mask )
    else:
        cpgfilter = bx.align.sitemask.cpg.Inclusive( mask=mask )
    cpgfilter.run( reader, writer.write )

    print >> sys.stderr, str( float(cpgfilter.masked)/float(cpgfilter.total) * 100 ) + "% bases masked."
Example 32
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    try:
        bases = base_coverage(g1)
    except ParseError, exc:
        fail("Invalid file format: %s" % str(exc))
Example 33
def main():
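    # Intersect two interval files; with `pieces`, emit the overlapping
    # fragments rather than the whole intervals.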
    mincols = 1
    upstream_pad = 0
    downstream_pad = 0

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols: mincols = int(options.mincols)
        pieces = bool(options.pieces)
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for line in intersect([g1, g2], pieces=pieces, mincols=mincols):
            if type(line) == GenomicInterval:
                out_file.write("%s\n" % "\t".join(line.fields))
            else:
                out_file.write("%s\n" % line)
    except ParseError, e:
        out_file.close()
        fail("Invalid file format: %s" % str(e))
Example 34
def main():
    sameformat = False
    upstream_pad = 0
    downstream_pad = 0

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        if options.sameformat: sameformat = True
        in_file_1, in_file_2, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_file_1 ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            fix_strand=True )

    g2 = NiceReaderWrapper( fileinput.FileInput( in_file_2 ),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True )

    if strand_col_1 >= 0:
        g1.strand_col = strand_col_1

    out_file = open( out_fname, "w" )

    try:
        for line in concat( [g1, g2], sameformat=sameformat ):
            if type( line ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( line.fields ) )
            else:
                out_file.write( "%s\n" % line )
    except ParseError, exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Example 35
def main():

    options, args = doc_optparse.parse(__doc__)
    try:
        sources = args[0].translate(tree_tx).split()
        seq_db = load_seq_db(args[1])
        index = maf.MultiIndexed(args[2:])

        out = maf.Writer(sys.stdout)
        missing_data = bool(options.missingData)
        use_strand = bool(options.strand)
    except:
        doc_optparse.exception()

    for line in sys.stdin:
        fields = line.split()
        ref_src, start, end = fields[0:3]
        if use_strand and len(fields) > 5:
            strand = fields[5]
        else:
            strand = "+"
        do_interval(sources, index, out, ref_src, int(start), int(end), seq_db, missing_data, strand)

    out.close()
Example 36
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    try:
        bases = base_coverage(g1)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )
    out_file = open( out_fname, "w" )
    out_file.write( "%s\n" % str( bases ) )
    out_file.close()
    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
Example 37
def main():
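    # Subtract the second file's lines from the first, optionally comparing
    # only columns begin_col..end_col, and report what was removed.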
    
    # Parsing Command Line here
    options, args = doc_optparse.parse( __doc__ )

    try:
        inp1_file, inp2_file, begin_col, end_col, out_file = args
    except:
        doc_optparse.exception()
    
    begin_col = begin_col.strip()
    end_col = end_col.strip()
    
    if begin_col != 'None' or end_col != 'None':
        """
        The user selected columns for restriction.  We'll allow default
        values for both begin_col and end_col as long as the user selected
        at least one of them for restriction.
        """
        if begin_col == 'None':
            begin_col = end_col
        elif end_col == 'None':
            end_col = begin_col
        begin_col = int(begin_col)
        end_col = int(end_col)
        """Make sure that begin_col <= end_col (switch if not)"""
        if begin_col > end_col:
            tmp_col = end_col
            end_col = begin_col
            begin_col = tmp_col
    else:
        begin_col = end_col = ''

    try:
        fo = open(out_file,'w')
    except:
        print >> sys.stderr, "Unable to open output file"
        sys.exit()

    """
    len1 is the number of lines in inp1_file
    lines1 is the set of unique lines in inp1_file
    diff1 is the number of duplicate lines removed from inp1_file
    """
    len1, lines1 = get_lines(inp1_file, begin_col, end_col, options.ignore_empty_end_cols)
    diff1 = len1 - len(lines1)
    len2, lines2 = get_lines(inp2_file, begin_col, end_col, options.ignore_empty_end_cols)
    
    lines1.difference_update(lines2)
    """lines1 is now the set of unique lines in inp1_file - the set of unique lines in inp2_file"""

    for line in lines1:
        print >> fo, line

    fo.close()
    
    info_msg = 'Subtracted %d lines. ' %((len1 - diff1) - len(lines1))
    
    if begin_col and end_col:
        info_msg += 'Restricted to columns c' + str(begin_col) + ' thru c' + str(end_col) + '. '

    if diff1 > 0:
        info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' %diff1
    
    print info_msg
Example 38
def main():
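    # Build an interval index for a wiggle file (plain, bz2, or lzo),
    # recording the file offset of each declaration section.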

    # Parse command line

    options, args = doc_optparse.parse(__doc__)
    if options.version: return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith(".bz2"):
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index bz2 compressed files first "
                                  "create a bz2t file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File(wiggle_file, table_file)
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith(".lzo"):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists(table_file):
                doc_optparse.exit("To index lzo compressed files first "
                                  "create a lzot file with bzip-table.")
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableLzopFile(wiggle_file, table_file)
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open(wiggle_file)
        # Determine the name of the index file
        if len(args) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except:
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None

    # always for wiggle data
    strand = '+'

    mode = "bed"

    while 1:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line: break

        if line.isspace() or line.startswith("track") or line.startswith(
                "#") or line.startswith("browser"):
            continue
        elif line.startswith("bed"):
            indexes.add(fields[0], int(fields[1]), int(fields[2]), pos)
        elif line.startswith("variableStep") or line.startswith("fixedStep"):
            if first_pos is not None:
                indexes.add(last_chrom, start, end, first_pos)
            first_pos = pos
            header = bx.wiggle.parse_header(line)
            last_chrom = header['chrom']
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header:
                current_span = int(header['span'])
            else:
                current_span = 1
            if 'step' in header:
                current_step = int(header['step'])

            if line.startswith("variableStep"):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "variableStep":
            fields = line.split()
            end = int(fields[0]) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step
        else:
            raise ValueError("Unexpected input line: %s" % line.strip())

    # Index the final declaration section, which no later declaration flushes.
    if first_pos is not None:
        indexes.add(last_chrom, start, end, first_pos)

    out = open(index_file, 'w')
    indexes.write(out)
    out.close()
Example 39
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        if options.distance:
            distance = int(options.distance)
        if options.overlap:
            distance = -1 * int(options.overlap)
        if options.output:
            output = int(options.output)
        if options.minregions:
            minregions = int(options.minregions)
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    # Get the cluster tree
    try:
        clusters, extra = find_clusters(g1,
                                        mincols=distance,
                                        minregions=minregions)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))

    f1 = open(in_fname, "r")
    out_file = open(out_fname, "w")

    # If "merge"
    if output == 1:
        fields = [
            "."
            for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)
        ]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write("%s\n" % "\t".join(fields))

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write("%s\n" % line.rstrip("\n\r"))

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write("%s\n" % fileLines[linenum].rstrip("\n\r"))

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval(
                            g1, fileline.split("\t"), g1.chrom_col,
                            g1.start_col, g1.end_col, g1.strand_col,
                            g1.default_strand, g1.fix_strand)
                    except Exception as exc:
                        print(str(exc), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write("%s\n" % outinterval)

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=""))
Example 40
def main():
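    # Mask alignment columns whose quality is below `minqual` in axt or maf
    # input, using per-species length and quality files from options.list.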
    
    options, args = doc_optparse.parse( __doc__ )
    try:
        inputformat = options.input
        outputformat = options.output
        mask = options.mask
        minqual = int(options.quality)
        qtype = options.type
        speciesAndLens = options.list
        inputfile = args[0]
        outputfile = args[1]
    except:
        doc_optparse.exception()

    outstream = open( outputfile, "w" )
    instream = open( inputfile, "r" )

    qualfiles = {}

    # read lens
    specieslist = speciesAndLens.split(":")
    species_to_lengths = {}
    
    for entry in specieslist:
        fields = entry.split(",")
        lenstream = fileinput.FileInput( fields[1] )
        lendict = dict()
        for line in lenstream:
            region = line.split()
            lendict[region[0]] = int(region[1])
        species_to_lengths[fields[0]] = lendict
        if len(fields) >= 3:
            qualfiles[fields[0]] = fields[2]

    specieslist = map( lambda(a): a.split(":")[0], specieslist )
    
    # open quality binned_arrays
    reader = None
    writer = None
    
    if inputformat == "axt":
        # load axt
        if len(specieslist) != 2:
            print "AXT is pairwise only."
            sys.exit()
        reader = bx.align.axt.Reader(instream, species1=specieslist[0], \
                                     species2=specieslist[1], \
                                     species_to_lengths = species_to_lengths)
    elif outputformat == "maf":
        # load maf
        reader = bx.align.maf.Reader(instream, species_to_lengths=species_to_lengths)

    if outputformat == "axt":
        # setup axt
        if len(specieslist) != 2:
            print "AXT is pairwise only."
            sys.exit()
        writer = bx.align.axt.Writer(outstream, attributes=reader.attributes)
    elif outputformat == "maf":
        # setup maf
        writer = bx.align.maf.Writer(outstream, attributes=reader.attributes)

    qualfilter = Simple( mask=mask, qualspecies = species_to_lengths, \
                         qualfiles = qualfiles, minqual = minqual, cache=50 )

    qualfilter.run( reader, writer.write )

    print "For "+str(qualfilter.total)+" base pairs, "+str(qualfilter.masked)+" base pairs were masked."
    print str(float(qualfilter.masked)/float(qualfilter.total) * 100)+"%"
Example 41
def main():
    distance = 0
    minregions = 2
    output = 1

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        if options.distance:
            distance = int( options.distance )
        if options.overlap:
            distance = -1 * int( options.overlap )
        if options.output:
            output = int( options.output )
        if options.minregions:
            minregions = int( options.minregions )
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    # Get the cluster tree
    try:
        clusters, extra = find_clusters( g1, mincols=distance, minregions=minregions)
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    f1 = open( in_fname, "r" )
    out_file = open( out_fname, "w" )

    # If "merge"
    if output == 1:
        fields = ["." for x in range(max(g1.chrom_col, g1.start_col, g1.end_col) + 1)]
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                fields[g1.chrom_col] = chrom
                fields[g1.start_col] = str(start)
                fields[g1.end_col] = str(end)
                out_file.write( "%s\n" % "\t".join( fields ) )

    # If "filtered" we preserve order of file and comments, etc.
    if output == 2:
        linenums = dict()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                linenums[linenum] = 0
        linenum = -1
        f1.seek(0)
        for line in f1.readlines():
            linenum += 1
            if linenum in linenums or linenum in extra:
                out_file.write( "%s\n" % line.rstrip( "\n\r" ) )

    # If "clustered" we output original intervals, but near each other (i.e. clustered)
    if output == 3:
        linenums = list()
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for linenum in tree.getlines():
                out_file.write( "%s\n" % fileLines[linenum].rstrip( "\n\r" ) )

    # If "minimum" we output the smallest interval in each cluster
    if output == 4 or output == 5:
        f1.seek(0)
        fileLines = f1.readlines()
        for chrom, tree in clusters.items():
            for start, end, lines in tree.getregions():
                outsize = -1
                outinterval = None
                for line in lines:
                    # three nested for loops?
                    # should only execute this code once per line
                    fileline = fileLines[line].rstrip("\n\r")
                    try:
                        cluster_interval = GenomicInterval( g1, fileline.split("\t"),
                                                            g1.chrom_col,
                                                            g1.start_col,
                                                            g1.end_col,
                                                            g1.strand_col,
                                                            g1.default_strand,
                                                            g1.fix_strand )
                    except Exception as exc:
                        print(str( exc ), file=sys.stderr)
                        f1.close()
                        sys.exit()
                    interval_size = cluster_interval.end - cluster_interval.start
                    if outsize == -1 or \
                       ( outsize > interval_size and output == 4 ) or \
                       ( outsize < interval_size and output == 5 ):
                        outinterval = cluster_interval
                        outsize = interval_size
                out_file.write( "%s\n" % outinterval )

    f1.close()
    out_file.close()

    if g1.skipped > 0:
        print(skipped( g1, filedesc="" ))
Example no. 42
0
def main():

    options, args = doc_optparse.parse(__doc__)
    try:
        inputformat = options.input
        outputformat = options.output
        mask = options.mask
        minqual = int(options.quality)
        qtype = options.type
        speciesAndLens = options.list
        inputfile = args[0]
        outputfile = args[1]
    except:
        doc_optparse.exception()

    outstream = open(outputfile, "w")
    instream = open(inputfile, "r")

    qualfiles = {}

    # read lens
    specieslist = speciesAndLens.split(":")
    species_to_lengths = {}

    for entry in specieslist:
        fields = entry.split(",")
        lenstream = fileinput.FileInput(fields[1])
        lendict = dict()
        for line in lenstream:
            region = line.split()
            lendict[region[0]] = int(region[1])
        species_to_lengths[fields[0]] = lendict
        if len(fields) >= 3:
            qualfiles[fields[0]] = fields[2]

    specieslist = map(lambda (a): a.split(":")[0], specieslist)

    # open quality binned_arrays
    reader = None
    writer = None

    if inputformat == "axt":
        # load axt
        if len(specieslist) != 2:
            print "AXT is pairwise only."
            sys.exit()
        reader = bx.align.axt.Reader(instream, species1=specieslist[0], \
                                     species2=specieslist[1], \
                                     species_to_lengths = species_to_lengths)
    elif outputformat == "maf":
        # load maf
        reader = bx.align.maf.Reader(instream,
                                     species_to_lengths=species_to_lengths)

    if outputformat == "axt":
        # setup axt
        if len(specieslist) != 2:
            print "AXT is pairwise only."
            sys.exit()
        writer = bx.align.axt.Writer(outstream, attributes=reader.attributes)
    elif outputformat == "maf":
        # setup maf
        writer = bx.align.maf.Writer(outstream, attributes=reader.attributes)

    qualfilter = Simple(mask=mask, qualspecies=species_to_lengths,
                        qualfiles=qualfiles, minqual=minqual, cache=50)

    qualfilter.run(reader, writer.write)

    print "For " + str(qualfilter.total) + " base pairs, " + str(
        qualfilter.masked) + " base pairs were masked."
    print str(float(qualfilter.masked) / float(qualfilter.total) * 100) + "%"
Example no. 43
0
def __main__():
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse(__doc__)
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg(
                options.cols)
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg(
                options.cols)
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile(dir=".").name
            cmd = "faToTwoBit %s %s" % (fasta_file, seq_path)

            tmp_name = tempfile.NamedTemporaryFile(dir=".").name
            tmp_stderr = open(tmp_name, 'wb')
            proc = subprocess.Popen(args=cmd,
                                    shell=True,
                                    stderr=tmp_stderr.fileno())
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open(tmp_name, 'rb')
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read(buffsize)
                    if not stderr or len(stderr) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err('Error running faToTwoBit. ' + str(e))
    else:
        seq_path = check_seq_file(dbkey, GALAXY_DATA_INDEX_DIR)
        if not os.path.exists(seq_path):
            # If this occurs, we need to fix the metadata validator.
            stop_err(
                "No sequences are available for '%s', request them by reporting this error."
                % dbkey)

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines(feature):
        if isinstance(feature, gff_util.GFFFeature):
            return feature.lines()
        else:
            return [feature.rstrip('\r\n')]

    skipped_lines = 0
    first_invalid_line = 0
    invalid_lines = []
    fout = open(output_filename, "w")
    warnings = []
    warning = ''
    twobitfile = None
    file_iterator = open(input_filename)
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper(file_iterator,
                                                  fix_strand=False)
    line_count = 1
    for feature in file_iterator:
        # Ignore comments, headers.
        if isinstance(feature, (Header, Comment)):
            line_count += 1
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed(feature)
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip('\r\n')
            if line and not line.startswith("#"):
                fields = line.split('\t')
                try:
                    chrom = fields[chrom_col]
                    start = int(fields[start_col])
                    end = int(fields[end_col])
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed(
                            [start, end])
                    if includes_strand_col:
                        strand = fields[strand_col]
                except:
                    warning = "Invalid chrom, start or end column values. "
                    warnings.append(warning)
                    if not invalid_lines:
                        invalid_lines = get_lines(feature)
                        first_invalid_line = line_count
                    skipped_lines += len(invalid_lines)
                    continue
                if start > end:
                    warning = "Invalid interval, start '%d' > end '%d'.  " % (
                        start, end)
                    warnings.append(warning)
                    if not invalid_lines:
                        invalid_lines = get_lines(feature)
                        first_invalid_line = line_count
                    skipped_lines += len(invalid_lines)
                    continue

                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval.
        if seq_path and os.path.exists("%s/%s.nib" % (seq_path, chrom)):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile(
                    open("%s/%s.nib" % (seq_path, chrom)))
            try:
                sequence = nib.get(start, end - start)
            except Exception as e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                    start, end - start, dbkey)
                warnings.append(warning)
                if not invalid_lines:
                    invalid_lines = get_lines(feature)
                    first_invalid_line = line_count
                skipped_lines += len(invalid_lines)
                continue
        elif seq_path and os.path.isfile(seq_path):
            if not twobitfile:
                twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))
            try:
                if gff_format and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[
                            interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except:
                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (
                    start, end - start, chrom)
                warnings.append(warning)
                if not invalid_lines:
                    invalid_lines = get_lines(feature)
                    first_invalid_line = line_count
                skipped_lines += len(invalid_lines)
                continue
        else:
            warning = "Chromosome by name '%s' was not found for build '%s'. " % (
                chrom, dbkey)
            warnings.append(warning)
            if not invalid_lines:
                invalid_lines = get_lines(feature)
                first_invalid_line = line_count
            skipped_lines += len(invalid_lines)
            continue
        if sequence == '':
            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % \
                ( chrom, start, end, dbkey )
            warnings.append(warning)
            if not invalid_lines:
                invalid_lines = get_lines(feature)
                first_invalid_line = line_count
            skipped_lines += len(invalid_lines)
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement(sequence)

        if output_format == "fasta":
            l = len(sequence)
            c = 0
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff([start, end])
            fields = [dbkey, str(chrom), str(start), str(end), strand]
            meta_data = "_".join(fields)
            if name.strip():
                fout.write(">%s %s\n" % (meta_data, name))
            else:
                fout.write(">%s\n" % meta_data)
            while c < l:
                b = min(c + 50, l)
                fout.write("%s\n" % str(sequence[c:b]))
                c = b
        else:  # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join([
                    feature.chrom, "galaxy_extract_genomic_dna", "interval",
                    str(feature.start),
                    str(feature.end), feature.score, feature.strand, ".",
                    gff_util.gff_attributes_to_str(feature.attributes, "GTF")
                ])
            else:
                meta_data = "\t".join(fields)
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write(format_str % (meta_data, str(sequence)))

        # Update line count.
        if isinstance(feature, gff_util.GFFFeature):
            line_count += len(feature.intervals)
        else:
            line_count += 1

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped_lines:
        # Error message includes up to the first 10 skipped lines.
        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))

    # Clean up temp file.
    if fasta_file:
        os.remove(seq_path)
        os.remove(tmp_name)
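The FASTA branch above wraps output at 50 columns with a manual `while c < l` loop. The same logic isolated as a small helper (names are mine, not Galaxy's):

# Sketch: wrap a sequence to fixed-width FASTA lines, mirroring the
# "while c < l" loop used in the extract-DNA snippets.
def write_fasta(out, header, sequence, width=50):
    out.write(">%s\n" % header)
    for offset in range(0, len(sequence), width):
        out.write("%s\n" % sequence[offset:offset + width])

# Usage:
# import sys
# write_fasta(sys.stdout, "hg18_chr1_100_200_+", "ACGT" * 30)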
Example no. 44
0
def main():
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        in1_gff_format = bool(options.gff1)
        in2_gff_format = bool(options.gff2)
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper(fileinput.FileInput(in_fname),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True)
    g2 = in2_reader_wrapper(fileinput.FileInput(in2_fname),
                            chrom_col=chr_col_2,
                            start_col=start_col_2,
                            end_col=end_col_2,
                            strand_col=strand_col_2,
                            fix_strand=True)

    # Find flanking features.
    out_file = open(out_fname, "w")
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type(result) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'

                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff(line)
                    closest_feature = convert_bed_coords_to_gff(
                        closest_feature)

                    # Replace double quotes with single quotes in closest feature's attributes.
                    out_file.write( "%s closest_feature \"%s\" \n" %
                                    ( "\t".join( line.fields ), \
                                      "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
                                     ) )
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend(line.fields)
                    output_line_fields.extend(closest_feature.fields)
                    out_file.write("%s\n" % ("\t".join(output_line_fields)))
            else:
                out_file.write("%s\n" % result)
    except ParseError as exc:
        fail("Invalid file format: %s" % str(exc))
Example no. 45
0
def __main__():

    lflank = 0
    rflank = 0

    options, args = doc_optparse.parse(__doc__)
    try:
        chrom_col, start_col, end_col, strand_col = parse_cols_arg(
            options.cols)
        output_format = options.output_format
        seq_path = options.seq_path
        if options.left_flank:
            lflank = int(options.left_flank)
        if options.right_flank:
            rflank = int(options.right_flank)
        input_filename, output_filename = args
    except:
        doc_optparse.exception()
    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}
    twobits = {}
    if not os.path.exists(seq_path):
        # If this occurs, we need to fix the metadata validator.
        print("No sequences are available for '%s', request them by reporting this error." % seq_path)

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    fout = open(output_filename, "w")
    warnings = []
    warning = ''
    twobitfile = None
    dbkey = seq_path

    for i, line in enumerate(open(input_filename)):
        line = line.rstrip('\r\n')
        if line and not line.startswith("#"):
            fields = line.split('\t')
            try:
                chrom = fields[chrom_col]
                ostart = int(fields[start_col])
                oend = int(fields[end_col])
                start = ostart - lflank
                end = oend + rflank
                if includes_strand_col:
                    strand = fields[strand_col]
            except:
                warning = "Invalid chrom, start or end column values. "
                warnings.append(warning)
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue
            if start > end:
                warning = "Invalid interval, start '%d' > end '%d'.  " % (
                    start, end)
                warnings.append(warning)
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue

            if strand not in ['+', '-']:
                strand = '+'
            sequence = ''

            if seq_path and os.path.exists("%s/%s.nib" % (seq_path, chrom)):
                if chrom in nibs:
                    nib = nibs[chrom]
                else:
                    nibs[chrom] = nib = bx.seq.nib.NibFile(
                        open("%s/%s.nib" % (seq_path, chrom)))
                try:
                    sequence = nib.get(start, end - start)
                except:
                    warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                        start, end - start, dbkey)
                    warnings.append(warning)
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue
            elif seq_path and os.path.isfile(seq_path):
                if not (twobitfile):
                    twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))
                try:
                    sequence = twobitfile[chrom][start:end]
                except:
                    warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (
                        start, end - start, dbkey)
                    warnings.append(warning)
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue
            else:
                warning = "Chromosome by name '%s' was not found for build '%s'. " % (
                    chrom, dbkey)
                warnings.append(warning)
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue
            if sequence == '':
                warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % (
                    chrom, start, end, dbkey)
                warnings.append(warning)
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue
            if includes_strand_col and strand == "-":
                sequence = reverse_complement(sequence)
            sequence = (sequence[:lflank].lower() +
                        sequence[lflank:len(sequence) - rflank].upper() +
                        sequence[len(sequence) - rflank:].lower())

            if output_format == "fasta":
                l = len(sequence)
                c = 0
                fields = [dbkey, str(chrom), str(ostart), str(oend), strand]
                meta_data = "_".join(fields)
                fout.write(">%s\n" % meta_data)
                while c < l:
                    b = min(c + 50, l)
                    fout.write("%s\n" % str(sequence[c:b]))
                    c = b
            else:  # output_format == "interval"
                meta_data = "\t".join(fields)
                fout.write("%s\t%s\n" % (meta_data, str(sequence)))

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len(warnings)
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped_lines:
        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % (
            skipped_lines, first_invalid_line, invalid_line)
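The casing trick above marks the flanks in lowercase and the original interval in uppercase. Isolated as a helper (my naming), with the slice boundaries made explicit:

# Sketch: lowercase the left/right flanks, uppercase the core interval.
def case_flanks(sequence, lflank, rflank):
    core_end = len(sequence) - rflank
    return (sequence[:lflank].lower() +
            sequence[lflank:core_end].upper() +
            sequence[core_end:].lower())

# case_flanks("aaacgtaaa", 3, 3) -> "aaaCGTaaa"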
Example no. 46
0
        for position in posgen:
            kwargs['position'] = position
            self.out_stream.write(self.template % kwargs)

    def close(self):
        self.out_stream.flush()
        self.out_stream.close()

if __name__ == "__main__":
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = [int(x) - 1 for x in options.cols1.split(',')]
        chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [int(x) - 1 for x in options.cols2.split(',')]
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    # Sort through a tempfile first
    temp_file = tempfile.NamedTemporaryFile(mode="r")
    environ['LC_ALL'] = 'POSIX'
    commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (chr_col_1 + 1, start_col_1 + 1, end_col_1 + 1, temp_file.name, in_fname)
    errorcode, stdout = subprocess.getstatusoutput(commandline)

    coverage = CoverageWriter( out_stream=open(out_fname, "a"),
                               chromCol=chr_col_2, positionCol=position_col_2,
                               forwardCol=forward_col_2, reverseCol=reverse_col_2, )
    temp_file.seek(0)
    interval = io.NiceReaderWrapper( temp_file,
                                     chrom_col=chr_col_1,
                                     start_col=start_col_1,
                                     end_col=end_col_1,
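Example no. 46 (and its twin, Example no. 51 below) sorts the input through a temp file before computing coverage. A self-contained sketch of that sort-then-stream pattern, using hypothetical column numbers:

# Sketch: sort an interval file by chrom/start/end into a temp file,
# then stream the sorted lines. Column numbers here are hypothetical.
import subprocess
import tempfile

def sorted_intervals(in_fname, chrom_col=1, start_col=2, end_col=3):
    temp = tempfile.NamedTemporaryFile(mode="r")
    cmd = "sort -f -n -k %d -k %d -k %d -o %s %s" % (
        chrom_col, start_col, end_col, temp.name, in_fname)
    status, output = subprocess.getstatusoutput(cmd)
    if status != 0:
        raise RuntimeError("sort failed: %s" % output)
    temp.seek(0)
    return temp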
Example no. 47
0
def main():

    # Parsing Command Line here
    options, args = doc_optparse.parse(__doc__)

    try:
        inp1_file, inp2_file, begin_col, end_col, out_file = args
    except:
        doc_optparse.exception()

    begin_col = begin_col.strip()
    end_col = end_col.strip()

    if begin_col != 'None' or end_col != 'None':
        """
        The user selected columns for restriction.  We'll allow default
        values for both begin_col and end_col as long as the user selected
        at least one of them for restriction.
        """
        if begin_col == 'None':
            begin_col = end_col
        elif end_col == 'None':
            end_col = begin_col
        begin_col = int(begin_col)
        end_col = int(end_col)
        """Make sure that begin_col <= end_col (switch if not)"""
        if begin_col > end_col:
            tmp_col = end_col
            end_col = begin_col
            begin_col = tmp_col
    else:
        begin_col = end_col = ''

    try:
        fo = open(out_file, 'w')
    except:
        print >> sys.stderr, "Unable to open output file"
        sys.exit()
    """
    len1 is the number of lines in inp1_file
    lines1 is the set of unique lines in inp1_file
    diff1 is the number of duplicate lines removed from inp1_file
    """
    len1, lines1 = get_lines(inp1_file, begin_col, end_col,
                             options.ignore_empty_end_cols)
    diff1 = len1 - len(lines1)
    len2, lines2 = get_lines(inp2_file, begin_col, end_col,
                             options.ignore_empty_end_cols)

    lines1.difference_update(lines2)
    """lines1 is now the set of unique lines in inp1_file - the set of unique lines in inp2_file"""

    for line in lines1:
        print(line, file=fo)

    fo.close()

    info_msg = 'Subtracted %d lines. ' % ((len1 - diff1) - len(lines1))

    if begin_col and end_col:
        info_msg += 'Restricted to columns c' + str(
            begin_col) + ' thru c' + str(end_col) + '. '

    if diff1 > 0:
        info_msg += 'Eliminated %d duplicate/blank/comment/invalid lines from first query.' % diff1

    print(info_msg)
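Example no. 47's `get_lines` helper isn't shown, but the subtraction only needs each line reduced to its selected columns before the set difference. A minimal sketch of that idea (my helper, not the tool's actual `get_lines`):

# Sketch: collect unique lines, optionally restricted to a 1-based
# column range, so that set difference mimics the subtraction above.
def unique_lines(fname, begin_col=None, end_col=None):
    count = 0
    seen = set()
    with open(fname) as handle:
        for line in handle:
            line = line.rstrip("\r\n")
            if not line or line.startswith("#"):
                continue
            count += 1
            if begin_col is not None:
                line = "\t".join(line.split("\t")[begin_col - 1:end_col])
            seen.add(line)
    return count, seen

# len1, lines1 = unique_lines("a.tab", 2, 4)
# len2, lines2 = unique_lines("b.tab", 2, 4)
# lines1.difference_update(lines2)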
Example no. 48
0
def __main__():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
        dbkey = options.dbkey
        output_format = options.output_format
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}
    twobits = {}
    seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
    if not os.path.exists( seq_path ):
        # If this occurs, we need to fix the metadata validator.
        stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    fout = open( output_filename, "w" )
    warnings = []
    warning = ''
    twobitfile = None
     
    for i, line in enumerate( open( input_filename ) ):
        line = line.rstrip( '\r\n' )
        if line and not line.startswith( "#" ):
            fields = line.split( '\t' )
            try:
                chrom = fields[chrom_col]
                start = int( fields[start_col] )
                end = int( fields[end_col] )
                if includes_strand_col:
                    strand = fields[strand_col]
            except:
                warning = "Invalid chrom, start or end column values. "
                warnings.append( warning )
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue
            if start > end:
                warning = "Invalid interval, start '%d' > end '%d'.  " % ( start, end )
                warnings.append( warning )
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue

            if strand not in ['+', '-']:
                strand = '+'
            sequence = ''

            if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
                if chrom in nibs:
                    nib = nibs[chrom]
                else:
                    nibs[chrom] = nib = bx.seq.nib.NibFile( open( "%s/%s.nib" % ( seq_path, chrom ) ) )
                try:
                    sequence = nib.get( start, end-start )
                except:
                    warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
                    warnings.append( warning )
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue
            elif seq_path and os.path.isfile( seq_path ):
                if not twobitfile:
                    twobitfile = bx.seq.twobit.TwoBitFile( open( seq_path ) )
                try:
                    sequence = twobitfile[chrom][start:end]
                except:
                    warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " %( start, end-start, dbkey )
                    warnings.append( warning )
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue
            else:
                warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
                warnings.append( warning )
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue
            if sequence == '':
                warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " %( chrom, start, end, dbkey )
                warnings.append( warning )
                skipped_lines += 1
                if not invalid_line:
                    first_invalid_line = i + 1
                    invalid_line = line
                continue
            if includes_strand_col and strand == "-":
                sequence = reverse_complement( sequence )

            if output_format == "fasta" :
                l = len( sequence )        
                c = 0
                fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
                meta_data = "_".join( fields )
                fout.write( ">%s\n" % meta_data )
                while c < l:
                    b = min( c + 50, l )
                    fout.write( "%s\n" % str( sequence[c:b] ) )
                    c = b
            else: # output_format == "interval"
                meta_data = "\t".join( fields )
                fout.write( "%s\t%s\n" % ( meta_data, str( sequence ) ) )

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len( warnings )
        warn_msg += warnings[0]
        print( warn_msg )
    if skipped_lines:
        print 'Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
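Several snippets call `reverse_complement` for minus-strand intervals. A stand-in with the same intent (bx-python ships its own; this sketch just illustrates the operation):

# Sketch: reverse-complement a DNA string (stand-in for the
# reverse_complement the snippets import).
_COMPLEMENT = str.maketrans("ACGTacgtNn", "TGCAtgcaNn")

def reverse_complement(sequence):
    return sequence.translate(_COMPLEMENT)[::-1]

# reverse_complement("ACGTn") -> "nACGT"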
Example no. 49
0
def main():
    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg( options.cols2 )
        in1_gff_format = bool( options.gff1 )
        in2_gff_format = bool( options.gff2 )
        in_fname, in2_fname, out_fname, direction = args
    except:
        doc_optparse.exception()

    # Set readers to handle either GFF or default format.
    if in1_gff_format:
        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in1_reader_wrapper = NiceReaderWrapper
    if in2_gff_format:
        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
    else:
        in2_reader_wrapper = NiceReaderWrapper

    g1 = in1_reader_wrapper( fileinput.FileInput( in_fname ),
                             chrom_col=chr_col_1,
                             start_col=start_col_1,
                             end_col=end_col_1,
                             strand_col=strand_col_1,
                             fix_strand=True )
    g2 = in2_reader_wrapper( fileinput.FileInput( in2_fname ),
                             chrom_col=chr_col_2,
                             start_col=start_col_2,
                             end_col=end_col_2,
                             strand_col=strand_col_2,
                             fix_strand=True )

    # Find flanking features.
    out_file = open( out_fname, "w" )
    try:
        for result in proximal_region_finder([g1, g2], direction):
            if type( result ) is list:
                line, closest_feature = result
                # Need to join outputs differently depending on file types.
                if in1_gff_format:
                    # Output is GFF with added attribute 'closest feature.'

                    # Intervals are in BED coordinates; need to convert to GFF.
                    line = convert_bed_coords_to_gff( line )
                    closest_feature = convert_bed_coords_to_gff( closest_feature )

                    # Replace double quotes with single quotes in closest feature's attributes.
                    out_file.write( "%s closest_feature \"%s\" \n" %
                                    ( "\t".join( line.fields ),
                                      "\t".join( closest_feature.fields ).replace( "\"", "\\\"" )
                                      ) )
                else:
                    # Output is BED + closest feature fields.
                    output_line_fields = []
                    output_line_fields.extend( line.fields )
                    output_line_fields.extend( closest_feature.fields )
                    out_file.write( "%s\n" % ( "\t".join( output_line_fields ) ) )
            else:
                out_file.write( "%s\n" % result )
    except ParseError as exc:
        fail( "Invalid file format: %s" % str( exc ) )

    print("Direction: %s" % (direction))
    if g1.skipped > 0:
        print(skipped( g1, filedesc=" of 1st dataset" ))
    if g2.skipped > 0:
        print(skipped( g2, filedesc=" of 2nd dataset" ))
Example no. 50
0
def __main__():
    #
    # Parse options, args.
    #
    options, args = doc_optparse.parse( __doc__ )
    try:
        if len(options.cols.split(',')) == 5:
            # BED file
            chrom_col, start_col, end_col, strand_col, name_col = parse_cols_arg( options.cols )
        else:
            # gff file
            chrom_col, start_col, end_col, strand_col = parse_cols_arg( options.cols )
            name_col = False
        dbkey = options.dbkey
        output_format = options.output_format
        gff_format = options.gff
        interpret_features = options.interpret_features
        GALAXY_DATA_INDEX_DIR = options.GALAXY_DATA_INDEX_DIR
        fasta_file = options.fasta
        input_filename, output_filename = args
    except:
        doc_optparse.exception()

    includes_strand_col = strand_col >= 0
    strand = None
    nibs = {}

    #
    # Set path to sequence data.
    #
    if fasta_file:
        # Need to create 2bit file from fasta file.
        try:
            seq_path = tempfile.NamedTemporaryFile( dir="." ).name
            cmd = "faToTwoBit %s %s" % ( fasta_file, seq_path )

            tmp_name = tempfile.NamedTemporaryFile( dir="." ).name
            tmp_stderr = open( tmp_name, 'wb' )
            proc = subprocess.Popen( args=cmd, shell=True, stderr=tmp_stderr.fileno() )
            returncode = proc.wait()
            tmp_stderr.close()

            # Get stderr, allowing for case where it's very large.
            tmp_stderr = open( tmp_name, 'rb' )
            stderr = ''
            buffsize = 1048576
            try:
                while True:
                    stderr += tmp_stderr.read( buffsize )
                    if not stderr or len( stderr ) % buffsize != 0:
                        break
            except OverflowError:
                pass
            tmp_stderr.close()

            # Error checking.
            if returncode != 0:
                raise Exception(stderr)
        except Exception as e:
            stop_err( 'Error running faToTwoBit. ' + str( e ) )
    else:
        seq_path = check_seq_file( dbkey, GALAXY_DATA_INDEX_DIR )
        if not os.path.exists( seq_path ):
            # If this occurs, we need to fix the metadata validator.
            stop_err( "No sequences are available for '%s', request them by reporting this error." % dbkey )

    #
    # Fetch sequences.
    #

    # Get feature's line(s).
    def get_lines( feature ):
        if isinstance( feature, gff_util.GFFFeature ):
            return feature.lines()
        else:
            return [ feature.rstrip( '\r\n' ) ]

    skipped_lines = 0
    first_invalid_line = 0
    invalid_lines = []
    fout = open( output_filename, "w" )
    warnings = []
    warning = ''
    twobitfile = None
    file_iterator = open( input_filename )
    if gff_format and interpret_features:
        file_iterator = gff_util.GFFReaderWrapper( file_iterator, fix_strand=False )
    line_count = 1
    for feature in file_iterator:
        # Ignore comments, headers.
        if isinstance( feature, ( Header, Comment ) ):
            line_count += 1
            continue

        name = ""
        if gff_format and interpret_features:
            # Processing features.
            gff_util.convert_gff_coords_to_bed( feature )
            chrom = feature.chrom
            start = feature.start
            end = feature.end
            strand = feature.strand
        else:
            # Processing lines, either interval or GFF format.
            line = feature.rstrip( '\r\n' )
            if line and not line.startswith( "#" ):
                fields = line.split( '\t' )
                try:
                    chrom = fields[chrom_col]
                    start = int( fields[start_col] )
                    end = int( fields[end_col] )
                    if name_col:
                        name = fields[name_col]
                    if gff_format:
                        start, end = gff_util.convert_gff_coords_to_bed( [start, end] )
                    if includes_strand_col:
                        strand = fields[strand_col]
                except:
                    warning = "Invalid chrom, start or end column values. "
                    warnings.append( warning )
                    if not invalid_lines:
                        invalid_lines = get_lines( feature )
                        first_invalid_line = line_count
                    skipped_lines += len( invalid_lines )
                    continue
                if start > end:
                    warning = "Invalid interval, start '%d' > end '%d'.  " % ( start, end )
                    warnings.append( warning )
                    if not invalid_lines:
                        invalid_lines = get_lines( feature )
                        first_invalid_line = line_count
                    skipped_lines += len( invalid_lines )
                    continue

                if strand not in ['+', '-']:
                    strand = '+'
                sequence = ''
            else:
                continue

        # Open sequence file and get sequence for feature/interval.
        if seq_path and os.path.exists( "%s/%s.nib" % ( seq_path, chrom ) ):
            # TODO: improve support for GFF-nib interaction.
            if chrom in nibs:
                nib = nibs[chrom]
            else:
                nibs[chrom] = nib = bx.seq.nib.NibFile( open( "%s/%s.nib" % ( seq_path, chrom ) ) )
            try:
                sequence = nib.get( start, end - start )
            except Exception as e:
                warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % ( start, end - start, dbkey )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        elif seq_path and os.path.isfile( seq_path ):
            if not twobitfile:
                twobitfile = bx.seq.twobit.TwoBitFile( open( seq_path ) )
            try:
                if gff_format and interpret_features:
                    # Create sequence from intervals within a feature.
                    sequence = ''
                    for interval in feature.intervals:
                        sequence += twobitfile[interval.chrom][interval.start:interval.end]
                else:
                    sequence = twobitfile[chrom][start:end]
            except:
                warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % ( start, end - start, chrom )
                warnings.append( warning )
                if not invalid_lines:
                    invalid_lines = get_lines( feature )
                    first_invalid_line = line_count
                skipped_lines += len( invalid_lines )
                continue
        else:
            warning = "Chromosome by name '%s' was not found for build '%s'. " % ( chrom, dbkey )
            warnings.append( warning )
            if not invalid_lines:
                invalid_lines = get_lines( feature )
                first_invalid_line = line_count
            skipped_lines += len( invalid_lines )
            continue
        if sequence == '':
            warning = "Chrom: '%s', start: '%s', end: '%s' is either invalid or not present in build '%s'. " % \
                ( chrom, start, end, dbkey )
            warnings.append( warning )
            if not invalid_lines:
                invalid_lines = get_lines( feature )
                first_invalid_line = line_count
            skipped_lines += len( invalid_lines )
            continue
        if includes_strand_col and strand == "-":
            sequence = reverse_complement( sequence )

        if output_format == "fasta":
            l = len( sequence )
            c = 0
            if gff_format:
                start, end = gff_util.convert_bed_coords_to_gff( [ start, end ] )
            fields = [dbkey, str( chrom ), str( start ), str( end ), strand]
            meta_data = "_".join( fields )
            if name.strip():
                fout.write( ">%s %s\n" % (meta_data, name) )
            else:
                fout.write( ">%s\n" % meta_data )
            while c < l:
                b = min( c + 50, l )
                fout.write( "%s\n" % str( sequence[c:b] ) )
                c = b
        else:  # output_format == "interval"
            if gff_format and interpret_features:
                # TODO: need better GFF Reader to capture all information needed
                # to produce this line.
                meta_data = "\t".join(
                    [feature.chrom, "galaxy_extract_genomic_dna", "interval",
                    str( feature.start ), str( feature.end ), feature.score, feature.strand,
                    ".", gff_util.gff_attributes_to_str( feature.attributes, "GTF" ) ] )
            else:
                meta_data = "\t".join( fields )
            if gff_format:
                format_str = "%s seq \"%s\";\n"
            else:
                format_str = "%s\t%s\n"
            fout.write( format_str % ( meta_data, str( sequence ) ) )

        # Update line count.
        if isinstance( feature, gff_util.GFFFeature ):
            line_count += len( feature.intervals )
        else:
            line_count += 1

    fout.close()

    if warnings:
        warn_msg = "%d warnings, 1st is: " % len( warnings )
        warn_msg += warnings[0]
        print(warn_msg)
    if skipped_lines:
        # Error message includes up to the first 10 skipped lines.
        print('Skipped %d invalid lines, 1st is #%d, "%s"' % ( skipped_lines, first_invalid_line, '\n'.join( invalid_lines[:10] ) ))

    # Clean up temp file.
    if fasta_file:
        os.remove( seq_path )
        os.remove( tmp_name )
Example no. 51
0
        self.out_stream.flush()
        self.out_stream.close()


if __name__ == "__main__":
    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = [
            int(x) - 1 for x in options.cols1.split(',')
        ]
        chr_col_2, position_col_2, forward_col_2, reverse_col_2 = [
            int(x) - 1 for x in options.cols2.split(',')
        ]
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    # Sort through a tempfile first
    temp_file = tempfile.NamedTemporaryFile(mode="r")
    environ['LC_ALL'] = 'POSIX'
    commandline = "sort -f -n -k %d -k %d -k %d -o %s %s" % (
        chr_col_1 + 1, start_col_1 + 1, end_col_1 + 1, temp_file.name,
        in_fname)
    errorcode, stdout = subprocess.getstatusoutput(commandline)

    coverage = CoverageWriter(
        out_stream=open(out_fname, "a"),
        chromCol=chr_col_2,
        positionCol=position_col_2,
        forwardCol=forward_col_2,
        reverseCol=reverse_col_2,
Example no. 52
0
def main():

    # Parse command line

    options, args = doc_optparse.parse( __doc__ )
    if options.version: return

    try:
        wiggle_file = args[0]
        # If it appears to be a bz2 file, attempt to open with table
        if wiggle_file.endswith( ".bz2" ):
            table_file = wiggle_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index bz2 compressed files first "
                                   "create a bz2t file with bzip-table." )
            # Open with SeekableBzip2File so we have tell support
            wiggle_in = SeekableBzip2File( wiggle_file, table_file )
            # Strip .bz2 from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        elif wiggle_file.endswith( ".lzo" ):
            from bx.misc.seeklzop import SeekableLzopFile
            table_file = wiggle_file + "t"
            if not os.path.exists( table_file ):
                doc_optparse.exit( "To index lzo compressed files first "
                                   "create a lzot file with bzip-table." )
            # Open with SeekableLzopFile so we have tell support
            wiggle_in = SeekableLzopFile( wiggle_file, table_file )
            # Strip .lzo from the filename before adding ".index"
            wiggle_file = wiggle_file[:-4]
        else:
            wiggle_in = open( wiggle_file )
        # Determine the name of the index file
        if len( args ) > 1:
            index_file = args[1]
        else:
            index_file = wiggle_file + ".index"
    except:
        doc_optparse.exception()

    indexes = interval_index_file.Indexes()

    # Can't use the iterator, as there is no next() and thus
    # no way to access the positions. The following code is 
    # modified from wiggle.py
    last_chrom = None
    start = None
    end = None
    first_pos = None

    # always for wiggle data
    strand = '+'

    mode = "bed"

    while True:
        pos = wiggle_in.tell()
        line = wiggle_in.readline()
        if not line: break

        if line.isspace() or line.startswith( "track" ) or line.startswith( "#" ) or line.startswith( "browser" ):
            continue
        elif line.startswith( "bed" ):
            indexes.add( fields[0], int( fields[1] ), int( fields[2] ), pos )
        elif line.startswith( "variableStep" ) or line.startswith( "fixedStep"):
            if first_pos is not None:
                indexes.add( last_chrom, start, end, first_pos )
            first_pos = pos
            header = bx.wiggle.parse_header( line )
            last_chrom = header['chrom']
            start = int(header['start']) - 1
            end = start
            current_step = None
            if 'span' in header: 
                current_span = int( header['span'] )
            else: 
                current_span = 1
            if 'step' in header: 
                current_step = int( header['step'] )

            if line.startswith( "variableStep" ):
                mode = "variableStep"
            else:
                mode = "fixedStep"
        elif mode == "variableStep":
            fields = line.split()
            end = int( fields[0] ) - 1 + current_span
        elif mode == "fixedStep":
            end += current_step
        else:
            raise "Unexpected input line: %s" % line.strip()

    # Flush the final block; headers only flush their predecessor.
    if first_pos is not None:
        indexes.add( last_chrom, start, end, first_pos )

    out = open( index_file, 'w' )
    indexes.write( out )
    out.close()
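`bx.wiggle.parse_header` above turns a `fixedStep`/`variableStep` declaration into a dict. A stand-in sketch of that parsing (the real function lives in bx-python):

# Sketch: parse 'fixedStep chrom=chr1 start=100 step=10 span=5' into
# {'chrom': 'chr1', 'start': '100', ...}, keeping values as strings
# the way the indexing loop above expects (it calls int() itself).
def parse_wiggle_header(line):
    header = {}
    for field in line.split()[1:]:
        key, _, value = field.partition("=")
        header[key] = value
    return header

# parse_wiggle_header("variableStep chrom=chr2 span=3")
# -> {'chrom': 'chr2', 'span': '3'}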
Example no. 53
0
def main():
    allchroms = False
    upstream_pad = 0
    downstream_pad = 0

    options, args = doc_optparse.parse( __doc__ )
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg( options.cols1 )
        lengths = options.lengths
        if options.all: allchroms = True
        in_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper( fileinput.FileInput( in_fname ),
                            chrom_col=chr_col_1,
                            start_col=start_col_1,
                            end_col=end_col_1,
                            strand_col=strand_col_1,
                            fix_strand=True )

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation code in bx.
    dbfile = fileinput.FileInput( lengths )
    
    if dbfile:
        if not allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0],"0",str(end)]))
            except:
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open( out_fname, "w" )

    try:
        for interval in generator:
            if type( interval ) is GenomicInterval:
                out_file.write( "%s\n" % "\t".join( interval ) )
            else:
                out_file.write( "%s\n" % interval )
    except ParseError as exc:
        out_file.close()
        fail( "Invalid file format: %s" % str( exc ) )
Example no. 54
0
def main():
    mincols = 1
    leftfill = False
    rightfill = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        chr_col_2, start_col_2, end_col_2, strand_col_2 = parse_cols_arg(
            options.cols2)
        if options.mincols:
            mincols = int(options.mincols)
        if options.fill:
            if options.fill == "both":
                rightfill = leftfill = True
            else:
                rightfill = options.fill == "right"
                leftfill = options.fill == "left"
        in_fname, in2_fname, out_fname = args
    except:
        doc_optparse.exception()

    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)
    g2 = NiceReaderWrapper(fileinput.FileInput(in2_fname),
                           chrom_col=chr_col_2,
                           start_col=start_col_2,
                           end_col=end_col_2,
                           strand_col=strand_col_2,
                           fix_strand=True)

    out_file = open(out_fname, "w")

    try:
        for outfields in join(g1,
                              g2,
                              mincols=mincols,
                              rightfill=rightfill,
                              leftfill=leftfill):
            if type(outfields) is list:
                out_file.write("%s\n" % "\t".join(outfields))
            else:
                out_file.write("%s\n" % outfields)
    except ParseError as exc:
        out_file.close()
        fail("Invalid file format: %s" % str(exc))
    except MemoryError:
        out_file.close()
        fail("Input datasets were too large to complete the join operation.")

    out_file.close()

    if g1.skipped > 0:
        print(skipped(g1, filedesc=" of 1st dataset"))
    if g2.skipped > 0:
        print(skipped(g2, filedesc=" of 2nd dataset"))